videoDownloadTGbot/youtube-downloader/app.py

"""
YouTube Video Downloader Service
Отдельный микросервис для скачивания видео с YouTube

Версия 2: subprocess-based yt-dlp CLI (обход SSL бага в gunicorn pre-fork)
"""
import json as json_lib
import logging
import os
import re
import subprocess
import time
import traceback
import uuid
from pathlib import Path
from urllib.parse import urlparse

from flask import Flask, request, jsonify
from flask_cors import CORS

# Logging setup: timestamp, logger name, level and message; INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# Apply flask_cors to the whole app (no options are passed here).
CORS(app)

# Directory for temporary downloaded files. The path is relative, so it is
# resolved against the process working directory; created at import time.
DOWNLOADS_DIR = Path('downloads')
DOWNLOADS_DIR.mkdir(exist_ok=True)


def _safe_filename(title: str) -> str:
    """Build a unique output-template path inside DOWNLOADS_DIR for *title*.

    Characters that are illegal in file names on common filesystems
    (``< > : " / \\ | ? *``) are removed and the title is cut to 100
    characters. A random UUID prefix keeps the name unique.

    The returned string ends with the ``%(ext)s`` placeholder, i.e. it is a
    %-style template. Any literal ``%`` from the title is therefore doubled
    so it is emitted verbatim instead of being parsed as the start of a
    template field.

    Args:
        title: Human-readable video title.

    Returns:
        The template path as a string.
    """
    safe_title = re.sub(r'[<>:"/\\|?*]', '', title)[:100]
    # Escape after truncating so a "%%" pair is never split in half.
    safe_title = safe_title.replace('%', '%%')
    return str(DOWNLOADS_DIR / f'{uuid.uuid4()}_{safe_title}.%(ext)s')


def _cleanup_downloads() -> None:
    """Remove every file directly inside DOWNLOADS_DIR (best effort).

    Sub-directories are skipped because ``Path.unlink`` cannot remove them.
    A file that cannot be deleted is logged and skipped; the function never
    raises for a single failing entry.
    """
    for entry in DOWNLOADS_DIR.glob('*'):
        if entry.is_dir() and not entry.is_symlink():
            continue
        try:
            # missing_ok: the file may already have been removed elsewhere.
            entry.unlink(missing_ok=True)
        except OSError as exc:
            logger.warning(f"[CLEANUP] Не удалось удалить {entry.name}: {exc}")


def _find_latest_downloaded() -> Path | None:
    """Return the most recently modified entry of DOWNLOADS_DIR, or None.

    Entries whose suffix is ``.part`` or ``.ytdl`` are ignored.
    """
    ignored_suffixes = ('.part', '.ytdl')
    candidates = [
        entry for entry in DOWNLOADS_DIR.glob('*')
        if entry.suffix not in ignored_suffixes
    ]
    if candidates:
        # max() keeps the first of several equally-new entries, exactly like
        # taking element 0 of a stable descending sort.
        return max(candidates, key=lambda entry: entry.stat().st_mtime)
    return None


def _file_has_video_stream(filepath: Path) -> bool:
    """Ask ffprobe whether *filepath* contains a video stream.

    Returns True when ffprobe reports ``video`` for stream ``v:0``.
    If the probe itself cannot be executed (missing binary, timeout, ...)
    a warning is logged and True is returned, i.e. the check fails open.
    """
    probe_cmd = [
        'ffprobe', '-v', 'error', '-select_streams', 'v:0',
        '-show_entries', 'stream=codec_type', '-of', 'csv=p=0',
        str(filepath),
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=15)
    except Exception as e:
        logger.warning(f"[VALIDATE] Не удалось проверить видео-поток в {filepath.name}: {e}")
        return True
    reported_type = probe.stdout.strip()
    return reported_type == 'video'


def _file_has_audio_stream(filepath: Path) -> bool:
    """Проверяет через ffprobe, содержит ли файл аудио-поток."""
    try:
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-select_streams', 'a:0',
             '-show_entries', 'stream=codec_type', '-of', 'csv=p=0',
             str(filepath)],
            capture_output=True, text=True, timeout=15
        )
        return result.stdout.strip() == 'audio'
    except Exception:
        return False


def _find_video_file() -> Path | None:
    """Locate the downloaded video in DOWNLOADS_DIR, merging streams if needed.

    Entries are inspected newest first (``.part`` / ``.ytdl`` are ignored):

    * the first file that has both a video and an audio stream is returned
      as is;
    * otherwise, if a video-only file and an audio-only file were found,
      they are muxed with ``ffmpeg -c copy`` into ``<stem>_merged<suffix>``;
      on success both inputs are deleted and the merged file is returned;
    * if the merge cannot be performed (ffmpeg error, timeout, binary not
      available) the partial output is removed and the video-only file is
      returned.

    Returns:
        Path of the usable video file, or None if no video was found.
    """
    files = [f for f in DOWNLOADS_DIR.glob('*') if f.suffix not in ('.part', '.ytdl')]
    if not files:
        return None
    files.sort(key=lambda x: x.stat().st_mtime, reverse=True)

    video_file = None
    audio_file = None

    for f in files:
        if _file_has_video_stream(f):
            if _file_has_audio_stream(f):
                return f  # already a combined stream
            if video_file is None:
                video_file = f
        elif audio_file is None and _file_has_audio_stream(f):
            audio_file = f

    if video_file is None or audio_file is None:
        # Nothing to merge: either no video at all, or video without a
        # separate audio track.
        return video_file

    merged = DOWNLOADS_DIR / f"{video_file.stem}_merged{video_file.suffix}"
    logger.info(f"[MERGE] Мержим {video_file.name} + {audio_file.name} -> {merged.name}")
    try:
        result = subprocess.run(
            ['ffmpeg', '-y', '-i', str(video_file), '-i', str(audio_file),
             '-c', 'copy', '-map', '0:v:0', '-map', '1:a:0', str(merged)],
            capture_output=True, text=True, timeout=120
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        # Timeout or ffmpeg not runnable: fall back instead of crashing.
        logger.error(f"[MERGE] Ошибка ffmpeg: {exc}")
        merged.unlink(missing_ok=True)
        return video_file

    if result.returncode == 0:
        video_file.unlink(missing_ok=True)
        audio_file.unlink(missing_ok=True)
        return merged

    logger.error(f"[MERGE] Ошибка ffmpeg: {result.stderr[-300:]}")
    # Drop the broken output so it is not mistaken for the newest download.
    merged.unlink(missing_ok=True)
    return video_file


# ═══════════════════════════════════════════════════════════════
#  CORE: subprocess-based yt-dlp
# ═══════════════════════════════════════════════════════════════

# Name of the yt-dlp executable as passed to subprocess.
YTDLP_CMD = 'yt-dlp'
# Timeouts in seconds for a download run and for a metadata-only run.
DOWNLOAD_TIMEOUT = 300
INFO_TIMEOUT = 60

# Comma-separated YouTube player clients handed to the extractor.
PLAYER_CLIENTS = 'web,android'
# yt-dlp expects "IE_KEY:ARG=VAL1,VAL2;ARG=VAL1,...": arguments of one
# extractor are separated by ';'. Using ':' instead makes everything after
# "player_client=" one comma-separated value list, so the "skip" argument is
# never applied.
EXTRACTOR_ARGS = f'youtube:player_client={PLAYER_CLIENTS};skip=translated_subs,hls'


def _build_ytdlp_base_cmd() -> list:
    """Assemble the yt-dlp CLI arguments shared by every invocation.

    The cookies file path is read from the YOUTUBE_COOKIES_FILE environment
    variable (default ``/app/youtube_cookies.txt``). ``--cookies`` is only
    appended when that file exists and is not empty.
    """
    base_args = [YTDLP_CMD]
    base_args.extend(('--socket-timeout', '15'))
    base_args.extend(('--extractor-args', EXTRACTOR_ARGS))
    base_args.extend(('--js-runtimes', 'node'))
    base_args.extend(('--remote-components', 'ejs:github'))
    base_args.append('--no-playlist')

    cookies_path = Path(os.getenv('YOUTUBE_COOKIES_FILE', '/app/youtube_cookies.txt'))
    usable_cookies = cookies_path.exists() and cookies_path.stat().st_size > 0
    if usable_cookies:
        base_args.extend(('--cookies', str(cookies_path.absolute())))
    return base_args


def _run_ytdlp(args: list, timeout: int = DOWNLOAD_TIMEOUT) -> subprocess.CompletedProcess:
    """Run the yt-dlp CLI as a child process (gives it a clean SSL stack).

    The full command line is logged, then executed with captured text
    output. ``subprocess.TimeoutExpired`` propagates to the caller.
    """
    command_line = ' '.join(args)
    logger.info(f"[YTDLP] {command_line}")
    completed = subprocess.run(
        args,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return completed


# ═══════════════════════════════════════════════════════════════
#  YouTube formatter parser (shared with old codebase)
# ═══════════════════════════════════════════════════════════════

def _parse_height(format_dict: dict) -> int:
    """Извлекает реальную высоту из формата."""
    h = format_dict.get('height')
    w = format_dict.get('width')
    if h and w and isinstance(h, (int, float)) and isinstance(w, (int, float)):
        return min(int(h), int(w))
    if h and isinstance(h, (int, float)) and h > 0:
        return int(h)
    if w and isinstance(w, (int, float)) and w > 0:
        return int(w)
    note = str(format_dict.get('format_note', '') or '')
    match = re.search(r'(\d+)\s*p', note)
    if match:
        return int(match.group(1))
    match = re.search(r'(\d+)\s*x\s*(\d+)', note, re.IGNORECASE)
    if match:
        return min(int(match.group(1)), int(match.group(2)))
    res = str(format_dict.get('resolution', '') or '')
    match = re.search(r'(\d+)\s*x\s*(\d+)', res, re.IGNORECASE)
    if match:
        return min(int(match.group(1)), int(match.group(2)))
    return 0


# ═══════════════════════════════════════════════════════════════
#  Форматы (--dump-json)
# ═══════════════════════════════════════════════════════════════

# Quality buckets offered to the client, highest first: (max height, label).
_QUALITY_TIERS = (
    (2160, '4K'), (1440, '1440p'), (1080, '1080p'), (720, '720p'),
    (480, '480p'), (360, '360p'), (240, '240p'), (144, '144p'),
)


def _estimate_format_size(fmt: dict, duration) -> int:
    """Return the size of *fmt* in bytes, or 0 if it cannot be determined.

    Uses ``filesize`` / ``filesize_approx`` when present, otherwise derives
    the size from the bitrate (``tbr``, or ``vbr`` + ``abr``) and *duration*.
    """
    size = fmt.get('filesize') or fmt.get('filesize_approx') or 0
    if size:
        return size
    if duration:
        tbr = fmt.get('tbr') or 0
        if tbr:
            return int(tbr * 1024 / 8 * duration)
        vbr = fmt.get('vbr') or 0
        abr = fmt.get('abr') or 0
        if vbr or abr:
            return int((vbr + abr) * 1024 / 8 * duration)
    return 0


def _select_best_audio(formats: list, duration) -> dict:
    """Pick the audio-only format with the highest bitrate.

    On equal bitrates the later list entry wins. Returns a dict with
    ``size``, ``ext`` and ``format_id``; ``format_id`` is None when the list
    contains no audio-only format.
    """
    best = {'size': 0, 'ext': 'm4a', 'format_id': None}
    best_rate = -1
    for fmt in formats:
        if fmt.get('vcodec', 'none') != 'none' or fmt.get('acodec', 'none') == 'none':
            continue
        rate = fmt.get('abr') or fmt.get('tbr') or 0
        if rate >= best_rate:
            best_rate = rate
            best = {'size': _estimate_format_size(fmt, duration),
                    'ext': fmt.get('ext', 'm4a'),
                    'format_id': fmt.get('format_id', '')}
    return best


def _build_estimated_formats(duration, max_height_limit: int) -> list[dict]:
    """Build placeholder entries with sizes guessed from typical bitrates."""
    typical_bitrates = {2160: 40000, 1440: 20000, 1080: 10000, 720: 5000,
                        480: 2500, 360: 1200, 240: 600, 144: 300}
    estimated = []
    for tier_height, label in _QUALITY_TIERS:
        if tier_height > max_height_limit:
            continue
        video_kbps = typical_bitrates.get(tier_height, 1000)
        total_kbps = video_kbps + 128
        bytes_est = total_kbps * 1000 / 8 * duration
        estimated.append({
            'format_id': f"bestvideo[height<={tier_height}]+bestaudio/best[height<={tier_height}]",
            'label': f"{label} (mp4)",
            'quality': label,
            'ext': 'mp4',
            'filesize_mb': round(bytes_est / 1024 / 1024, 1),
        })
    audio_bytes = 128 * 1000 / 8 * duration
    estimated.append({
        'format_id': 'bestaudio/best',
        'label': 'Audio only (m4a)',
        'quality': 'audio',
        'ext': 'm4a',
        'filesize_mb': round(audio_bytes / 1024 / 1024, 1),
    })
    return estimated


def get_youtube_formats(url: str) -> list[dict]:
    """Return the selectable formats of a video via ``yt-dlp --dump-json``.

    For every quality tier the tallest video format that fits the tier is
    chosen (a video-only format is preferred over a combined one of the same
    height). Each distinct height is offered once, so a video whose tallest
    stream has a non-standard height (e.g. 804) is still listed. An
    "Audio only" entry is added when the size of the best audio-only format
    is known. If nothing could be derived and the duration is known,
    estimated entries are generated instead.

    Args:
        url: Video URL passed to yt-dlp.

    Returns:
        A list of dicts with keys ``format_id`` (a yt-dlp format selector),
        ``label``, ``quality``, ``ext`` and ``filesize_mb`` (may be None).

    Raises:
        Exception: if yt-dlp cannot be run, exits with an error, or prints
            output that is not valid JSON.
    """
    logger.info(f"[FORMATS] Получение списка форматов для: {url}")

    # "--" ends option parsing, so the URL can never be read as an option.
    cmd = _build_ytdlp_base_cmd() + ['--dump-json', '--quiet', '--no-warnings', '--', url]
    try:
        proc = _run_ytdlp(cmd, timeout=INFO_TIMEOUT)
    except Exception as e:
        logger.error(f"[FORMATS] Ошибка subprocess: {e}")
        raise Exception(f"Не удалось получить информацию о видео: {e}") from e

    if proc.returncode != 0:
        err = proc.stderr.strip()[-500:]
        logger.error(f"[FORMATS] yt-dlp failed: {err}")
        raise Exception(f"yt-dlp error: {err}")

    try:
        info = json_lib.loads(proc.stdout)
    except json_lib.JSONDecodeError as e:
        raise Exception(f"Failed to parse --dump-json: {e}") from e

    formats = info.get('formats') or []
    duration = info.get('duration')
    logger.info(f"[FORMATS] Всего форматов: {len(formats)}, длительность: {duration}с")

    # Parse each video format's height once instead of once per tier.
    video_formats = []
    for f in formats:
        if f.get('vcodec', 'none') == 'none':
            continue
        height = _parse_height(f)
        if height > 0:
            video_formats.append((f, height))

    best_audio_info = _select_best_audio(formats, duration)
    max_actual_height = max((h for _, h in video_formats), default=2160)

    offered = []
    used_heights = set()

    for tier_height, _label in _QUALITY_TIERS:
        best_video = None
        best_video_height = 0
        is_best_dash = False

        for f, height in video_formats:
            if height > tier_height:
                continue
            is_dash = (f.get('acodec', 'none') == 'none')
            if height > best_video_height or (
                    height == best_video_height and is_dash and not is_best_dash):
                best_video = f
                best_video_height = height
                is_best_dash = is_dash

        # Tiers above the tallest stream resolve to that same stream; the
        # height de-duplication keeps it from being listed twice.
        if not best_video or best_video_height in used_heights:
            continue
        used_heights.add(best_video_height)

        video_size = _estimate_format_size(best_video, duration)
        has_audio = best_video.get('acodec', 'none') != 'none'
        total_size = video_size + (best_audio_info['size'] if not has_audio else 0)
        video_ext = best_video.get('ext', 'mp4')
        video_format_id = best_video.get('format_id', '')

        format_note = best_video.get('format_note', '') or ''
        if format_note and str(best_video_height) in format_note:
            display_label = format_note
        else:
            display_label = f"{best_video_height}p"

        if has_audio:
            format_selector = f"{video_format_id}/best[height<={best_video_height}]/best"
        elif best_audio_info['format_id']:
            format_selector = (
                f"{video_format_id}+{best_audio_info['format_id']}/"
                f"bestvideo[height<={best_video_height}]+bestaudio/"
                f"best[height<={best_video_height}]"
            )
        else:
            format_selector = f"{video_format_id}+bestaudio/best[height<={best_video_height}]/best"

        offered.append({
            'format_id': format_selector,
            'label': f"{display_label} ({video_ext})",
            'quality': display_label,
            'ext': video_ext,
            'filesize_mb': round(total_size / 1024 / 1024, 1) if total_size else None,
        })

    if best_audio_info['size']:
        offered.append({
            'format_id': 'bestaudio/best',
            'label': f"Audio only ({best_audio_info['ext']})",
            'quality': 'audio',
            'ext': best_audio_info['ext'],
            'filesize_mb': round(best_audio_info['size'] / 1024 / 1024, 1),
        })

    # Fallback: no real formats were found, so offer estimated ones.
    if not offered:
        logger.info("[FORMATS] Реальных форматов не найдено, генерируем оценочные")
        if duration:
            offered = _build_estimated_formats(duration, max_actual_height)

    logger.info(f"[FORMATS] Возвращаем {len(offered)} форматов")
    return offered


# ═══════════════════════════════════════════════════════════════
#  Скачивание (subprocess yt-dlp CLI)
# ═══════════════════════════════════════════════════════════════

def download_youtube_video(url: str, max_retries: int = 3, format_id: str | None = None) -> tuple[Path, str]:
    """Download a video by running the yt-dlp CLI as a subprocess.

    Every attempt starts by emptying DOWNLOADS_DIR. When an attempt fails
    with an error that looks cookie/authentication related and a cookies
    file was in use, the same command is retried once without ``--cookies``.
    Between attempts the function sleeps ``(attempt + 1) * 2`` seconds.

    Args:
        url: Video URL passed to yt-dlp.
        max_retries: Maximum number of attempts.
        format_id: yt-dlp format selector; a default mp4-preferring chain is
            used when omitted or empty.

    Returns:
        ``(path_to_file, source)`` where ``source`` is ``'cli'`` or
        ``'cli-no-cookies'``.

    Raises:
        Exception: on timeout of the last attempt, when yt-dlp succeeded but
            no file was found, or when all attempts failed. In the latter
            case the message ends with the tail of yt-dlp's stderr so that
            callers can react to the actual cause.
    """
    logger.info(f"[DOWNLOAD] Начало скачивания: {url} (format={format_id})")

    if not format_id:
        # Fallback chain expressed as a yt-dlp format selector.
        format_id = 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'

    # ".100B" limits the title to 100 bytes so that long (especially
    # multi-byte) titles cannot exceed the filesystem's file-name limit.
    output_tmpl = str(DOWNLOADS_DIR / '%(title).100B_%(id)s.%(ext)s')

    # Arguments that follow the shared base command. "--" ends option
    # parsing, so the URL can never be read as an option.
    download_args = [
        '--downloader', 'aria2c',
        '--downloader-args',
        'aria2c:--connect-timeout=15 --timeout=120 --max-tries=1',
        '-f', format_id,
        '-o', output_tmpl,
        '--', url,
    ]

    last_error = ''

    for attempt in range(max_retries):
        _cleanup_downloads()

        base_cmd = _build_ytdlp_base_cmd()
        cmd = base_cmd + download_args

        try:
            result = _run_ytdlp(cmd, timeout=DOWNLOAD_TIMEOUT)
        except subprocess.TimeoutExpired:
            logger.error(f"[DOWNLOAD] yt-dlp timeout ({DOWNLOAD_TIMEOUT}s)")
            if attempt < max_retries - 1:
                time.sleep((attempt + 1) * 2)
                continue
            raise Exception(f"Превышен таймаут скачивания ({DOWNLOAD_TIMEOUT}с)")

        if result.returncode == 0:
            for line in result.stdout.split('\n'):
                if 'Destination:' in line:
                    logger.info(f"[DOWNLOAD] {line.strip()}")

            downloaded = _find_latest_downloaded()
            if downloaded:
                logger.info(f"[DOWNLOAD] Скачан файл: {downloaded.name} ({downloaded.stat().st_size} bytes)")
                return downloaded, 'cli'

            logger.error("[DOWNLOAD] Файл не найден после успешного yt-dlp")
            raise Exception("Файл не найден после скачивания")

        # Error handling.
        stderr = result.stderr.strip()[-800:]
        last_error = stderr
        logger.error(f"[DOWNLOAD] Попытка {attempt + 1}: yt-dlp failed: {stderr[:300]}")

        # Retry without cookies on cookie-related errors. Only the base
        # command is inspected, so a user-supplied value can never be
        # mistaken for the "--cookies" option.
        lowered = stderr.lower()
        cookie_related = any(kw in lowered for kw in ('cookies', 'bot', 'sign in'))
        if cookie_related and '--cookies' in base_cmd:
            logger.warning("[DOWNLOAD] Пробуем без cookies")
            pos = base_cmd.index('--cookies')
            cmd_no_cookies = base_cmd[:pos] + base_cmd[pos + 2:] + download_args
            try:
                result2 = _run_ytdlp(cmd_no_cookies, timeout=DOWNLOAD_TIMEOUT)
                if result2.returncode == 0:
                    downloaded = _find_latest_downloaded()
                    if downloaded:
                        return downloaded, 'cli-no-cookies'
                else:
                    logger.warning(
                        f"[DOWNLOAD] Без cookies тоже ошибка: {result2.stderr.strip()[-300:]}")
            except Exception as exc:
                # Best effort: the regular retry loop continues afterwards.
                logger.warning(f"[DOWNLOAD] Попытка без cookies не удалась: {exc}")

        if attempt < max_retries - 1:
            time.sleep((attempt + 1) * 2)

    message = f"Не удалось скачать видео после {max_retries} попыток"
    if last_error:
        message = f"{message}: {last_error[-300:]}"
    raise Exception(message)


# ═══════════════════════════════════════════════════════════════
#  Кэш форматов
# ═══════════════════════════════════════════════════════════════

# In-memory formats cache: normalized URL -> (time.time() value at which the
# entry was stored, list of format dicts).
_formats_cache: dict[str, tuple[float, list[dict]]] = {}
# Lifetime of a cache entry in seconds.
_FORMATS_CACHE_TTL = 30 * 60  # 30 minutes


def _normalize_youtube_url(url: str) -> str:
    m = re.search(r'(youtu\.be/|youtube\.com/watch\?v=)([a-zA-Z0-9_-]{11})', url)
    if m:
        prefix, video_id = m.group(1), m.group(2)
        return f"https://www.youtube.com/watch?v={video_id}"
    return url


# ═══════════════════════════════════════════════════════════════
#  Flask endpoints
# ═══════════════════════════════════════════════════════════════

@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint: always answers 200 with a static JSON body."""
    payload = {'status': 'ok', 'service': 'youtube-downloader'}
    return jsonify(payload), 200


@app.route('/formats', methods=['POST'])
def formats():
    """Return the available formats for a YouTube URL.

    Expects a JSON object ``{"url": "<youtube url>"}``.

    Responses:
        200: ``{"formats": [...]}`` (served from the in-memory cache when a
            fresh entry exists for the normalized URL).
        400: the body is not a JSON object, ``url`` is missing or not a
            string, or the URL does not point to a YouTube host.
        500: ``{"error": "..."}`` for any failure while querying yt-dlp.
    """
    request_id = str(uuid.uuid4())[:8]
    logger.info(f"[FORMATS {request_id}] ========== ЗАПРОС ФОРМАТОВ ==========")

    try:
        # silent=True: a missing or malformed JSON body yields None, which is
        # answered with 400 below instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'url' not in data:
            return jsonify({'error': 'URL is required'}), 400

        url = data['url']
        if not isinstance(url, str) or not url.strip():
            return jsonify({'error': 'URL is required'}), 400
        url = url.strip()

        # Validate the host instead of searching for a substring: the value
        # is handed to the yt-dlp command line, so it must neither look like
        # an option nor point to a foreign site.
        is_youtube = False
        if not url.startswith('-'):
            try:
                parsed = urlparse(url if '://' in url else f'https://{url}')
                host = (parsed.hostname or '').lower()
                is_youtube = parsed.scheme in ('http', 'https') and (
                    host in ('youtube.com', 'youtu.be')
                    or host.endswith(('.youtube.com', '.youtu.be'))
                )
            except ValueError:
                is_youtube = False
        if not is_youtube:
            return jsonify({'error': 'Only YouTube URLs are supported'}), 400

        cache_key = _normalize_youtube_url(url)
        now = time.time()

        cached = _formats_cache.get(cache_key)
        if cached is not None:
            cached_time, cached_formats = cached
            if now - cached_time < _FORMATS_CACHE_TTL:
                logger.info(f"[FORMATS {request_id}] Кэш: {len(cached_formats)} форматов ({now - cached_time:.0f}с)")
                return jsonify({'formats': cached_formats}), 200

        # Drop every expired entry (this key's included) so stale entries
        # for URLs that are never requested again do not pile up.
        expired_keys = [
            key for key, (stored_at, _) in list(_formats_cache.items())
            if now - stored_at >= _FORMATS_CACHE_TTL
        ]
        for key in expired_keys:
            _formats_cache.pop(key, None)

        format_list = get_youtube_formats(url)
        _formats_cache[cache_key] = (time.time(), format_list)
        return jsonify({'formats': format_list}), 200

    except Exception as e:
        logger.error(f"[FORMATS {request_id}] Ошибка: {e}")
        logger.error(traceback.format_exc())
        return jsonify({'error': str(e)}), 500


@app.route('/download/stream', methods=['POST'])
def download_stream():
    """Download a YouTube video and return its bytes in the response body.

    Expects a JSON object ``{"url": "<youtube url>", "format_id": "<sel>"}``
    where ``format_id`` is an optional yt-dlp format selector.

    Responses:
        200: the file content with matching ``Content-Type`` and an
            ``attachment`` ``Content-Disposition``. The temporary file is
            removed whether or not reading it succeeded.
        400: the body is not a JSON object, ``url`` is missing or not a
            string, ``format_id`` is not a string, or the URL does not point
            to a YouTube host.
        500: ``{"error": "..."}``; a hint about refreshing cookies is
            appended when the error text looks authentication related.
    """
    request_id = str(uuid.uuid4())[:8]
    logger.info(f"[REQUEST {request_id}] ========== НОВЫЙ ЗАПРОС ==========")

    try:
        # silent=True: a missing or malformed JSON body yields None, which is
        # answered with 400 below instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'url' not in data:
            return jsonify({'error': 'URL is required'}), 400

        url = data['url']
        if not isinstance(url, str) or not url.strip():
            return jsonify({'error': 'URL is required'}), 400
        url = url.strip()

        format_id = data.get('format_id')
        if format_id is not None and not isinstance(format_id, str):
            return jsonify({'error': 'format_id must be a string'}), 400
        logger.info(f"[REQUEST {request_id}] Скачивание: {url}, format_id={format_id}")

        # Validate the host instead of searching for a substring: the value
        # is handed to the yt-dlp command line, so it must neither look like
        # an option nor point to a foreign site.
        is_youtube = False
        if not url.startswith('-'):
            try:
                parsed = urlparse(url if '://' in url else f'https://{url}')
                host = (parsed.hostname or '').lower()
                is_youtube = parsed.scheme in ('http', 'https') and (
                    host in ('youtube.com', 'youtu.be')
                    or host.endswith(('.youtube.com', '.youtu.be'))
                )
            except ValueError:
                is_youtube = False
        if not is_youtube:
            return jsonify({'error': 'Only YouTube URLs are supported'}), 400

        video_path, used_downloader = download_youtube_video(url, format_id=format_id)
        logger.info(f"[REQUEST {request_id}] Видео скачано: {video_path} ({used_downloader})")

        try:
            video_data = video_path.read_bytes()
        finally:
            # Never leave the temporary file behind, even if reading failed.
            video_path.unlink(missing_ok=True)
        logger.info(f"[REQUEST {request_id}] Размер файла: {len(video_data)} bytes")

        # The header value is ASCII-only and quoted, so drop non-ASCII
        # characters as well as quotes, backslashes and control characters.
        safe_filename = video_path.name.encode('ascii', 'ignore').decode('ascii')
        safe_filename = re.sub(r'[\x00-\x1f\x7f"\\]', '', safe_filename).strip()
        if not safe_filename.endswith(('.mp4', '.webm', '.mkv', '.m4a', '.mp3')):
            safe_filename = 'youtube_video.mp4'

        ext = video_path.suffix.lower()
        content_type_map = {
            '.webm': 'video/webm', '.mkv': 'video/x-matroska',
            '.mp4': 'video/mp4', '.m4a': 'audio/mp4', '.mp3': 'audio/mpeg',
        }
        content_type = content_type_map.get(ext, 'video/mp4')

        logger.info(f"[REQUEST {request_id}] ========== ЗАПРОС УСПЕШНО ЗАВЕРШЕН ==========")

        return video_data, 200, {
            'Content-Type': content_type,
            'Content-Disposition': f'attachment; filename="{safe_filename}"'
        }

    except Exception as e:
        error_str = str(e)
        logger.error(f"[REQUEST {request_id}] ========== ОШИБКА ==========")
        logger.error(f"[REQUEST {request_id}] {error_str}")
        logger.error(traceback.format_exc())

        if any(kw in error_str.lower() for kw in ('cookies', 'bot', 'sign in', 'authentication')):
            error_msg = (
                f"{error_str}\n\n"
                "💡 Совет: Cookies устарели или недействительны. "
                "Обновите cookies через скрипт:\n"
                "  ./youtube-downloader/get_youtube_cookies.sh\n"
                "Затем перезапустите сервис."
            )
        else:
            error_msg = error_str

        return jsonify({'error': error_msg}), 500


if __name__ == '__main__':
    # Direct-execution entry point: start Flask's built-in server.
    # PORT and HOST are read from the environment (defaults: 5000, 0.0.0.0).
    port = int(os.getenv('PORT', 5000))
    host = os.getenv('HOST', '0.0.0.0')
    logger.info(f"Запуск YouTube Downloader сервиса на {host}:{port}")
    app.run(host=host, port=port, debug=False)