#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
MejorTorrent Scraper + OMDb Enrichment + Dedupe + (Opcional) Transmission/Telegram

- Extrae: id, title, year, description, page_url, poster_url, torrent/magnet, infohash
- Enriquecido OMDb: rating_imdb, votes_imdb, imdb_id, genres[], runtime_min, poster mejorado
- Dedupe: SQLite por infohash (o hash del enlace si no hay)
"""

import io, re, time, json, hashlib, sqlite3, os
from dataclasses import dataclass, asdict, field
from typing import Iterable, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs, quote_plus

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup, NavigableString

# ========================= CONFIG =========================
BASE = "https://www36.mejortorrent.eu"
LISTING = BASE + "/peliculas?page={n}"

TIMEOUT = 20
PAGE_DELAY = 1.0
DETAIL_DELAY = 1
MAX_PAGES = 5              # ← how many listing pages to crawl

# Dedupe DB (use a separate one for this site)
DB_FILE = "torrents_seen_mejortorrent.sqlite3"

# Transmission (pip install transmission-rpc)
SEND_TO_TRANSMISSION = True
TR_HOST = "localhost"
TR_PORT = 9091
TR_USER = "transmission"
TR_PASS = "transmission"
TR_DOWNLOAD_DIR = "/mnt/hdd"

# Telegram (optional)
SEND_TO_TELEGRAM = False
TELEGRAM_TOKEN   = ""
TELEGRAM_CHAT_ID = ""
TG_MIN_DELAY_SEC = 4
TG_IMG_MAX_BYTES = 5 * 1024 * 1024

# OMDb (REQUIRED for enrichment)
OMDB_API_KEY = os.environ.get("OMDB_API_KEY", "f6b74a22")
# =========================================================

UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/124.0 Safari/537.36")

# Only valid detail URLs: /pelicula/<id>/<slug>
DETAIL_HREF_RE = re.compile(r"/pelicula/\d+/[A-Za-z0-9\-\_]+", re.I)

MAGNET_RE      = re.compile(r"^magnet:\?xt=", re.I)
TORRENT_EXT_RE = re.compile(r"\.torrent($|\?)", re.I)
YEAR_RE        = re.compile(r"(19\d{2}|20\d{2})")

# Limpia " | MejorTorrent ..." al final de títulos
SITE_SUFFIX_RE = re.compile(r"\s*\|\s*Mejor[Tt]orrent.*$", re.I)

LABEL_DESC_RE  = re.compile(r"^\s*(Sinopsis|Descripción)\s*[:：]?\s*$", re.I)

@dataclass
class Movie:
    id: str
    title: str
    year: Optional[int]
    description: str
    author: str
    page_url: str
    poster_url: str
    torrent: str
    infohash: str
    # Enriched fields (OMDb)
    rating_imdb: Optional[float] = None
    votes_imdb: Optional[int] = None
    imdb_id: Optional[str] = None
    genres: List[str] = field(default_factory=list)
    runtime_min: Optional[int] = None

# =============== HTTP session with retries ===============
def make_session():
    s = requests.Session()
    retry = Retry(
        total=5, connect=5, read=5,
        backoff_factor=0.6,
        allowed_methods=frozenset({"GET", "POST"}),
        status_forcelist=[429, 500, 502, 503, 504, 520, 521, 522, 524, 525],
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry)
    s.mount("http://", adapter)
    s.mount("https://", adapter)
    s.headers.update({"User-Agent": UA})
    return s

SESSION = make_session()

def get_soup(url: str) -> BeautifulSoup:
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

# ===================== Helpers ===========================
def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").replace("\xa0", " ")).strip()

def parse_year_any(text: str) -> Optional[int]:
    m = YEAR_RE.search(text or "")
    return int(m.group(1)) if m else None

def clean_title(raw: str) -> str:
    t = normalize_space(raw)
    t = SITE_SUFFIX_RE.sub("", t)  # strip " | MejorTorrent ..."
    # Strip typical download prefixes/suffixes
    t = re.sub(r"^\s*Descargar\s+", "", t, flags=re.I)
    t = re.sub(r"\s+por\s+Torrent\s*$", "", t, flags=re.I)
    t = re.sub(r"\s*Torrent\s+Gratis\s*$", "", t, flags=re.I)
    # Quita "Año 2024" o "(2024)" al final
    t = re.sub(r"\s*\(?\s*Año\s+\d{4}\s*\)?\s*$", "", t, flags=re.I)
    t = re.sub(r"\s*[-–—,]*\s*(?:\(|\[)?(19|20)\d{2}(?:\)|\])\s*$", "", t)
    return normalize_space(t).strip(" -–—·")
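
# Illustrative example (hypothetical title):
#   clean_title("Descargar Example Movie (2023) por Torrent | MejorTorrent") -> "Example Movie"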

def pick_meta(soup: BeautifulSoup, key: str, attr: str = "property") -> Optional[str]:
    tag = soup.find("meta", attrs={attr: key})
    if tag and tag.get("content"):
        return normalize_space(tag["content"])
    return None

def text_after_label_in_p(p_tag, label_tag) -> str:
    """Collect the text that follows label_tag within its parent paragraph (p_tag is unused)."""
    parts: List[str] = []
    for sib in label_tag.next_siblings:
        if isinstance(sib, NavigableString):
            parts.append(str(sib))
        else:
            parts.append(sib.get_text(" ", strip=True))
    return normalize_space(" ".join(parts))

def pick_description(soup: BeautifulSoup) -> str:
    # 1) <p> whose <b>/<strong> label reads "Sinopsis"/"Descripción"
    for p in soup.find_all("p"):
        lab = p.find(["b", "strong"])
        if lab and LABEL_DESC_RE.match(normalize_space(lab.get_text(" ", strip=True))):
            desc = text_after_label_in_p(p, lab)
            if not desc:
                desc = re.sub(r"^\s*(?:Sinopsis|Descripción)\s*[:：]?\s*", "", p.get_text(" ", strip=True), flags=re.I)
                desc = normalize_space(desc)
            if len(desc) >= 5:
                return desc
    # 2) Standalone text nodes reading "Sinopsis:" or "Descripción:"
    lbl = soup.find(string=LABEL_DESC_RE)
    if lbl:
        parent = getattr(lbl, "parent", None)
        if parent and parent.name in ("b", "strong"):
            p = parent.parent if (parent.parent and parent.parent.name == "p") else None
            if p:
                desc = text_after_label_in_p(p, parent)
                if len(desc) >= 5:
                    return desc
        txt = re.sub(LABEL_DESC_RE, "", str(lbl))
        txt = normalize_space(txt)
        if len(txt) >= 5:
            return txt
    # 3) Meta tags
    meta = pick_meta(soup, "og:description") or pick_meta(soup, "description", attr="name")
    if meta and len(meta) >= 8:
        return meta
    # 4) First reasonably long paragraph
    container = soup.find(["article", "main"]) or soup
    p = container.find("p")
    if p:
        txt = normalize_space(p.get_text(" ", strip=True))
        if len(txt) >= 20:
            return txt
    return ""

def looks_like_poster(url: str) -> bool:
    if not url:
        return False
    u = url.lower()
    if "qr" in u or u.endswith(".svg"):
        return False
    return any(u.endswith(ext) for ext in (".jpg", ".jpeg", ".png", ".webp"))

def pick_poster_url(soup: BeautifulSoup, base_url: str) -> str:
    og = pick_meta(soup, "og:image")
    if og and looks_like_poster(og):
        return urljoin(base_url, og)
    cand = soup.select_one(".poster img, .cover img, .dt_post img, article img, .media img")
    if cand and cand.get("src") and looks_like_poster(cand["src"]):
        return urljoin(base_url, cand["src"])
    for img in soup.find_all("img", src=True):
        src = img["src"].strip()
        if looks_like_poster(src):
            return urljoin(base_url, src)
    return ""

# =================== Infohash helpers ====================
def infohash_from_magnet(magnet: str) -> Optional[str]:
    try:
        q = parse_qs(urlparse(magnet).query)
        for xt in q.get("xt", []):
            m = re.search(r"urn:btih:([A-Za-z0-9]{32}|[A-Fa-f0-9]{40})", xt)
            if m:
                h = m.group(1)
                if len(h) == 32 and re.fullmatch(r"[A-Z2-7]{32}", h, flags=re.I):
                    import base64
                    try:
                        raw = base64.b32decode(h.upper())
                        return raw.hex().upper()
                    except Exception:
                        return h.upper()
                return h.upper()
    except Exception:
        pass
    return None
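
# Example with a dummy hash: infohash_from_magnet("magnet:?xt=urn:btih:" + "a" * 40)
# returns "A" * 40, i.e. the 40-char hex infohash uppercased.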

def bdecode(src: bytes, i: int = 0):
    """Minimal bencode decoder: returns (value, next_index) starting at offset i."""
    ch = src[i:i+1]
    if ch == b'i':  # integer: i<digits>e
        j = src.index(b'e', i)
        return int(src[i+1:j]), j + 1
    if ch == b'l':  # list: l<items>e
        i += 1
        out = []
        while src[i:i+1] != b'e':
            v, i = bdecode(src, i)
            out.append(v)
        return out, i + 1
    if ch == b'd':  # dict: d(<key><value>)*e
        i += 1
        d = {}
        while src[i:i+1] != b'e':
            k, i = bdecode(src, i)
            v, i = bdecode(src, i)
            d[k] = v
        return d, i + 1
    # byte string: <length>:<bytes>
    j = src.index(b':', i)
    ln = int(src[i:j])
    start = j + 1
    end = start + ln
    return src[start:end], end
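
# Example: bdecode(b"d3:foo3:bare") -> ({b'foo': b'bar'}, 12)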

def infohash_from_torrent_bytes(data: bytes) -> Optional[str]:
    try:
        obj, _ = bdecode(data, 0)
        if not isinstance(obj, dict): return None
        info = obj.get(b'info')
        if info is None: return None
        # Preferred path: hash the raw bencoded bytes of the "info" value exactly
        # as they appear in the file (avoids re-encoding discrepancies)
        try:
            idx = data.find(b'4:info')
            if idx != -1:
                _, end = bdecode(data, idx + len(b'4:info'))
                seg = data[idx + len(b'4:info'): end]
                return hashlib.sha1(seg).hexdigest().upper()
        except Exception:
            pass

        # Fallback: re-bencode the decoded info dict and hash that
        def bencode(x) -> bytes:
            if isinstance(x, int): return b"i%de" % x
            if isinstance(x, bytes): return str(len(x)).encode()+b":"+x
            if isinstance(x, str):
                b = x.encode(); return str(len(b)).encode()+b":"+b
            if isinstance(x, list): return b"l"+b"".join(bencode(i) for i in x)+b"e"
            if isinstance(x, dict):
                items = sorted(x.items())
                return b"d"+b"".join(bencode(k)+bencode(v) for k,v in items)+b"e"
            raise TypeError("bad type")
        return hashlib.sha1(bencode(info)).hexdigest().upper()
    except Exception:
        return None

def get_infohash_from_url(url: str) -> Optional[str]:
    if MAGNET_RE.match(url):
        return infohash_from_magnet(url)
    try:
        r = SESSION.get(url, timeout=TIMEOUT)
        r.raise_for_status()
        return infohash_from_torrent_bytes(r.content)
    except Exception as e:
        print(f"[WARN] No se pudo calcular infohash del .torrent: {e}")
        return None

# =================== Dedupe (SQLite) =====================
def db_init(path: str = DB_FILE) -> sqlite3.Connection:
    conn = sqlite3.connect(path)
    conn.execute("""CREATE TABLE IF NOT EXISTS seen (
        infohash TEXT PRIMARY KEY,
        url TEXT,
        added_at TEXT DEFAULT (datetime('now'))
    )""")
    return conn

def db_seen(conn: sqlite3.Connection, infohash: str) -> bool:
    cur = conn.execute("SELECT 1 FROM seen WHERE infohash=?", (infohash,))
    return cur.fetchone() is not None

def db_mark(conn: sqlite3.Connection, infohash: str, url: str):
    try:
        conn.execute("INSERT OR IGNORE INTO seen(infohash,url) VALUES(?,?)", (infohash, url))
        conn.commit()
    except sqlite3.Error:
        pass

# =================== Transmission ========================
def tr_connect():
    if not SEND_TO_TRANSMISSION:
        return None
    try:
        from transmission_rpc import Client
        c = Client(host=TR_HOST, port=TR_PORT, username=TR_USER, password=TR_PASS, timeout=20)
        _ = c.session_stats()
        return c
    except Exception as e:
        print(f"[WARN] No se pudo conectar a Transmission: {e}")
        return None

def tr_add(client, url: str, download_dir: str = "") -> bool:
    if not client: return False
    try:
        kwargs = {}
        if download_dir: kwargs["download_dir"] = download_dir
        client.add_torrent(url, **kwargs)
        return True
    except Exception as e:
        print(f"[WARN] Error añadiendo a Transmission: {e}")
        return False

# =================== Telegram ============================
_last_tg_sent_ts = 0

def html_escape(s: str) -> str:
    return (s or "").replace("&","&amp;").replace("<","&lt;").replace(">","&gt;")

def _tg_wait_min_delay():
    global _last_tg_sent_ts
    wait = TG_MIN_DELAY_SEC - (time.time() - _last_tg_sent_ts)
    if wait > 0:
        time.sleep(wait)

def _tg_caption(m: Movie) -> str:
    head = f"🎬 <b>{html_escape(m.title)}</b> ({m.year or 's/f'})"
    desc = html_escape(m.description or "")
    if len(desc) > 900:
        desc = desc[:900].rsplit(" ", 1)[0] + "…"
    return head + ("\n\n" + desc if desc else "")

def _tg_post(endpoint: str, data=None, files=None, timeout=20):
    url = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/{endpoint}"
    r = requests.post(url, data=data or {}, files=files or None, timeout=timeout)
    if r.status_code == 429:
        try:
            payload = r.json()
            retry_after = (payload.get("parameters") or {}).get("retry_after", 10)
        except Exception:
            retry_after = 10
        print(f"[TG] 429 rate limit. Esperando {retry_after}s…", flush=True)
        time.sleep(float(retry_after))
        return _tg_post(endpoint, data=data, files=files, timeout=timeout)
    r.raise_for_status()
    return r

def send_telegram(m: Movie):
    if not SEND_TO_TELEGRAM or not TELEGRAM_TOKEN or not TELEGRAM_CHAT_ID:
        return
    global _last_tg_sent_ts
    try:
        _tg_wait_min_delay()
        caption = _tg_caption(m)
        files = None
        data = {"chat_id": TELEGRAM_CHAT_ID, "parse_mode": "HTML", "caption": caption}
        if m.poster_url:
            resp = SESSION.get(m.poster_url, timeout=TIMEOUT, stream=True)
            resp.raise_for_status()
            buf = io.BytesIO()
            total = 0
            for chunk in resp.iter_content(64 * 1024):
                if not chunk: break
                total += len(chunk)
                if total > TG_IMG_MAX_BYTES:
                    files = None
                    break
                buf.write(chunk)
            if total and total <= TG_IMG_MAX_BYTES:
                buf.seek(0)
                files = {"photo": ("poster.jpg", buf, "image/jpeg")}
        if files:
            _tg_post("sendPhoto", data=data, files=files)
        else:
            data = {"chat_id": TELEGRAM_CHAT_ID, "parse_mode": "HTML", "text": caption}
            _tg_post("sendMessage", data=data)
        _last_tg_sent_ts = time.time()
    except Exception as e:
        print(f"[WARN] Telegram fallo: {e}", flush=True)

# ================== Scraping =============================
def extract_detail_links_from_listing(html: str) -> List[str]:
    links: List[str] = []
    seen: Set[str] = set()

    # 1) Very strict regex: /pelicula/<id>/<slug>
    for m in DETAIL_HREF_RE.finditer(html):
        u = urljoin(BASE, m.group(0))
        if u not in seen:
            seen.add(u); links.append(u)

    # 2) Extra pass with BeautifulSoup, filtering on the valid pattern
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.select('a[href*="/pelicula/"]'):
        href = a.get("href") or ""
        if not href:
            continue
        m = re.search(r"/pelicula/\d+/[A-Za-z0-9\-\_]+", href, re.I)
        if not m:
            continue
        u = urljoin(BASE, m.group(0))
        if u not in seen:
            seen.add(u); links.append(u)

    return links

def crawl_listing_pages(pages: Iterable[int]) -> List[str]:
    out: List[str] = []
    for n in pages:
        url = LISTING.format(n=n)
        try:
            r = SESSION.get(url, timeout=TIMEOUT)
            r.raise_for_status()
        except Exception as e:
            print(f"[WARN] listado fallo {url}: {e}")
            continue
        links = extract_detail_links_from_listing(r.text)
        print(f"[INFO] page {n}: {len(links)} enlaces")
        out.extend(links)
        time.sleep(PAGE_DELAY)
    # dedup
    seen: Set[str] = set(); uniq: List[str] = []
    for u in out:
        if u not in seen:
            seen.add(u); uniq.append(u)
    return uniq

def parse_detail(url: str) -> Optional[Movie]:
    try:
        soup = get_soup(url)
    except Exception as e:
        print(f"[WARN] detalle fallo {url}: {e}")
        return None

    # Title
    raw_title = pick_meta(soup, "og:title") or ""
    if not raw_title:
        h = soup.find(["h1","h2"])
        if h: raw_title = h.get_text(" ", strip=True)
    title = clean_title(raw_title)

    # Year
    year: Optional[int] = None
    for node in soup.find_all(string=re.compile(r"^\s*Año\s*:\s*", re.I)):
        year = parse_year_any(str(node))
        if year: break
    if not year and title:
        year = parse_year_any(title)
    if not year:
        year = parse_year_any(soup.get_text(" ", strip=True))

    # Description
    description = pick_description(soup)

    # Poster
    poster_url = pick_poster_url(soup, url)

    # Torrent/magnet
    torrent = ""
    for a in soup.select("a[href]"):
        href = (a.get("href") or "").strip()
        if not href: continue
        if MAGNET_RE.match(href) or TORRENT_EXT_RE.search(href) or "/torrent/" in href or "descargar-torrent" in href:
            torrent = urljoin(url, href)
            break

    # ID
    m_id = re.search(r"/pelicula/(\d+)/", url)
    pid = m_id.group(1) if m_id else url

    # Infohash
    infohash = get_infohash_from_url(torrent) if torrent else ""

    if not title and not torrent:
        return None

    return Movie(
        id=pid,
        title=title or "(sin título)",
        year=year,
        description=description,
        author="",
        page_url=url,
        poster_url=poster_url,
        torrent=torrent,
        infohash=infohash or ""
    )

# ================== OMDb Enrichment ======================
def omdb_enrich(title: str, year: Optional[int]) -> dict:
    """Consulta OMDb por título (y año si hay) y devuelve dict con campos limpios."""
    if not OMDB_API_KEY or OMDB_API_KEY.startswith("PON_"):
        return {}
    params = {
        "apikey": OMDB_API_KEY,
        "type": "movie",
        "t": title,
    }
    if year:
        params["y"] = str(year)
    # use HTTPS; query values are URL-encoded with quote_plus
    url = "https://www.omdbapi.com/?" + "&".join(f"{k}={quote_plus(v)}" for k, v in params.items())
    try:
        r = SESSION.get(url, timeout=TIMEOUT)
        r.raise_for_status()
        data = r.json()
    except Exception:
        return {}
    if not data or data.get("Response") != "True":
        return {}
    out = {}
    # imdb rating / votes
    try:
        rating = float(data.get("imdbRating")) if data.get("imdbRating") not in (None, "N/A") else None
    except Exception:
        rating = None
    try:
        votes = data.get("imdbVotes")
        votes = int(votes.replace(",", "")) if votes and votes != "N/A" else None
    except Exception:
        votes = None
    out["rating_imdb"] = rating
    out["votes_imdb"]  = votes
    imdb_id = data.get("imdbID") if data.get("imdbID") not in (None, "N/A") else None
    if imdb_id: out["imdb_id"] = imdb_id
    # genres
    genres = []
    raw_genres = data.get("Genre") or ""
    if raw_genres and raw_genres != "N/A":
        genres = [g.strip() for g in raw_genres.split(",") if g.strip()]
    out["genres"] = genres
    # runtime (in minutes)
    runtime_min = None
    rt = data.get("Runtime") or ""
    m = re.search(r"(\d+)", rt)
    if m:
        runtime_min = int(m.group(1))
    out["runtime_min"] = runtime_min
    # improved poster
    poster = data.get("Poster")
    if poster and poster != "N/A":
        out["poster_url"] = poster.replace("http://", "https://")
    # OMDb may return a better-normalized title, but we keep the locally scraped one
    return out
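
# Illustrative OMDb payload, trimmed to the fields omdb_enrich consumes (values are made up):
#   {"Response": "True", "imdbID": "tt1234567", "imdbRating": "7.4", "imdbVotes": "12,345",
#    "Genre": "Drama, Thriller", "Runtime": "118 min", "Poster": "https://..."}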

def merge_omdb(movie: Movie, extra: dict) -> Movie:
    if not extra:
        return movie
    # Poster: if the current one is empty or dubious (svg/qr), use OMDb's
    def poster_ok(u: str) -> bool:
        return bool(u) and looks_like_poster(u)
    if (not poster_ok(movie.poster_url)) and poster_ok(extra.get("poster_url", "")):
        movie.poster_url = extra["poster_url"]
    # Numeric fields
    if extra.get("rating_imdb") is not None:
        movie.rating_imdb = extra["rating_imdb"]
    if extra.get("votes_imdb") is not None:
        movie.votes_imdb = extra["votes_imdb"]
    if extra.get("runtime_min") is not None:
        movie.runtime_min = extra["runtime_min"]
    # Other fields
    if extra.get("imdb_id"):
        movie.imdb_id = extra["imdb_id"]
    if extra.get("genres"):
        movie.genres = extra["genres"]
    return movie

# =================== Main ===============================
def main():
    conn = db_init(DB_FILE)
    tr = tr_connect() if SEND_TO_TRANSMISSION else None
    if SEND_TO_TRANSMISSION and not tr:
        print("[WARN] SEND_TO_TRANSMISSION=True pero no hay conexión a Transmission.")

    detail_urls = crawl_listing_pages(range(1, MAX_PAGES + 1))
    if not detail_urls:
        print("[INFO] No se hallaron enlaces en listados.")
        return

    movies: List[Movie] = []
    for u in detail_urls:
        m = parse_detail(u)
        if not m:
            time.sleep(1.2)
            m = parse_detail(u)
        if not m:
            time.sleep(DETAIL_DELAY)
            continue

        # OMDb enrichment
        try:
            extra = omdb_enrich(m.title, m.year)
            m = merge_omdb(m, extra)
        except Exception as e:
            print(f"[WARN] OMDb enrich fallo para '{m.title}': {e}")

        # Dedupe by infohash (or by a hash of the torrent URL when there is no infohash)
        dedupe_key = m.infohash or (m.torrent and hashlib.sha1(m.torrent.encode()).hexdigest().upper()) or ""
        is_dup = False
        if dedupe_key:
            is_dup = db_seen(conn, dedupe_key)

        added_to_tr = False
        if not is_dup and dedupe_key:
            db_mark(conn, dedupe_key, m.torrent or "")
            if tr and m.torrent:
                added_to_tr = tr_add(tr, m.torrent, download_dir=TR_DOWNLOAD_DIR)
                print(f"[TR] {'Enviado' if added_to_tr else 'FALLO'} -> {m.title}")

        # Telegram, when applicable
        if not is_dup and SEND_TO_TELEGRAM:
            if (tr and added_to_tr) or (not SEND_TO_TRANSMISSION):
                send_telegram(m)

        movies.append(m)
        print(f"[OK] {m.title} ({m.year or 's/f'}) {'[dup]' if is_dup else ''}")
        time.sleep(DETAIL_DELAY)

    # Save JSON
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump([asdict(x) for x in movies], f, ensure_ascii=False, indent=2)
    print(f"[DONE] Guardado movies.json con {len(movies)} títulos")

if __name__ == "__main__":
    main()