#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Module de scraping des commentaires d'un post Facebook (page Collectif)."""
import time, re, os, requests
from selenium.webdriver.common.by import By
from fb_common import session_facebook, charger_cookies_fb, UA, DATA

# Display name of the page; parse_comments compares comment authors against it.
PAGE_NAME = "Collectif de Citoyens Ucclois"
# URL fragment of the page; parse_comments looks for it in profile links.
PAGE_FRAG = "collectif.de.citoyens.ucclois"
# Directory where download_photos writes image files (a "photos" folder under DATA).
PHOTOS_DIR = os.path.join(DATA, "photos")

# Interface labels (French and English) that _drop discards when the comment
# message is assembled, because they are not part of the comment text.
ACTIONS = {"J'aime", "Répondre", "Partager", "Modifié", "Masquer", "Suivre",
           "Aimer", "Auteur", "Voir la traduction", "Top fan", "Plus pertinents",
           "Tout afficher", "Like", "Reply", "Toutes les réactions :"}


def _drop(t):
    """Tell whether text fragment *t* should be discarded from a comment body.

    A fragment is dropped when it is one of the known interface labels in
    ACTIONS, when it looks like a relative or short timestamp ("il y a ...",
    "3 h", "2 sem", or one/two digits followed by a word), when it is made
    of digits only, or when it is at most one character long.

    Returns:
        True if the fragment is noise, False otherwise.
    """
    return bool(
        t in ACTIONS
        or re.match(r'^(il y a .+|\d+\s?(min|h|j|sem|ans?)|\d{1,2}\s\w+\.?)$', t)
        or re.match(r'^\d+$', t)
        or len(t) <= 1
    )


def clean_profile(href):
    """Strip tracking and comment parameters from a profile link.

    Everything from the first ``__cft__`` / ``__tn__`` tracking marker onwards
    is cut off, then ``comment_id`` and ``reply_comment_id`` query parameters
    are removed and the separators left behind are tidied up.

    Args:
        href: Raw link as read from the page, or a falsy value.

    Returns:
        The cleaned URL, or None when *href* is empty or None.
    """
    if not href:
        return None
    # Tracking markers can come first ("?") or later ("&") in the query string.
    for marker in ("&__cft__", "&__tn__", "?__cft__", "?__tn__"):
        href = href.split(marker)[0]
    # Keep the separator (group 1) so the neighbouring parameters stay delimited.
    href = re.sub(r'([?&])(?:reply_)?comment_id=[^&]*', r'\1', href)
    # Removing a parameter from the middle leaves "&&": collapse such runs.
    href = re.sub(r'&{2,}', '&', href)
    href = href.replace("?&", "?").rstrip("?&")
    return href


def author_id_from(profile):
    """Extract an author identifier from a profile URL.

    The numeric id of a ``profile.php?id=...`` link is preferred; otherwise
    the first path segment after ``facebook.com/`` is used.

    Args:
        profile: Profile URL, or a falsy value.

    Returns:
        The identifier as a string, or None when nothing can be extracted.
    """
    if not profile:
        return None
    # Patterns are tried in priority order; the first hit wins.
    for pattern in (r'profile\.php\?id=(\d+)', r'facebook\.com/([^/?&]+)'):
        found = re.search(pattern, profile)
        if found is not None:
            return found.group(1)
    return None


def bare_permalink(url):
    """Reduce a post URL of the page to its bare ``/posts/pfbid...`` permalink.

    Args:
        url: Any URL string.

    Returns:
        The permalink without query string or trailing parts, or None when
        *url* does not contain a matching post link of the page.
    """
    found = re.search(r'(https://www\.facebook\.com/collectif\.de\.citoyens\.ucclois/posts/pfbid[0-9A-Za-z]+)', url)
    if found is None:
        return None
    return found.group(1)


def expand_all(d):
    """Expand everything: top-level comments and every reply thread.

    Runs at most 35 passes. Each pass clicks the "more comments" spans
    (up to 10 per label), then the spans mentioning 'réponse' (up to 30,
    excluding those containing 'Masquer'), and finally scrolls to the bottom
    of the page. The loop stops early after three consecutive passes in which
    nothing was clicked.

    Args:
        d: Selenium driver positioned on the post page.
    """
    more_labels = ["Voir plus de commentaires", "Afficher plus de commentaires",
                   "Plus de commentaires", "Voir d’autres commentaires",
                   "commentaires précédents"]

    def _tap(element, pause):
        # Best-effort click: any failure on this element is ignored on purpose.
        tapped = False
        try:
            d.execute_script("arguments[0].scrollIntoView({block:'center'});", element)
            d.execute_script("arguments[0].click();", element)
            tapped = True
            time.sleep(pause)
        except Exception:
            pass
        return tapped

    idle_passes = 0
    for _ in range(35):
        hits = 0
        # 1) load more top-level comments
        for label in more_labels:
            spans = d.find_elements(By.XPATH, "//span[contains(text(),'%s')]" % label)
            for span in spans[:10]:
                if _tap(span, 0.7):
                    hits += 1
        # 2) unfold reply threads (never the 'Masquer' toggles)
        reply_spans = d.find_elements(
            By.XPATH,
            "//span[contains(text(),'réponse') and not(contains(text(),'Masquer'))]")
        for span in reply_spans[:30]:
            if _tap(span, 0.4):
                hits += 1
        d.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.1)
        if hits:
            idle_passes = 0
            continue
        idle_passes += 1
        if idle_passes >= 3:
            break


def parse_comments(d):
    """Collect the comments and replies currently present in the page.

    Every element whose aria-label starts with "Commentaire de" or
    "Réponse de" is turned into a dict with the keys id, parent, is_reply,
    author, profile, author_id, is_page, message, time_text, photos and label.

    Args:
        d: Selenium driver positioned on the (already expanded) post page.

    Returns:
        A dict mapping comment id to its record. When several elements
        resolve to the same id, the last one processed wins. Elements without
        an id, or whose processing raises, are skipped silently.
    """
    arts = d.find_elements(By.CSS_SELECTOR,
                           '[aria-label^="Commentaire de"], [aria-label^="Réponse de"]')
    by_id = {}
    for a in arts:
        try:
            al = a.get_attribute("aria-label") or ""
            links = [x.get_attribute("href") or "" for x in
                     a.find_elements(By.CSS_SELECTOR, "a[href*='facebook.com']")]
            # Keep the first comment_id and the first reply_comment_id seen in the links.
            cid = rid = None
            for L in links:
                m = re.search(r'[?&]comment_id=(\d+)', L)
                if m and not cid:
                    cid = m.group(1)
                m2 = re.search(r'[?&]reply_comment_id=(\d+)', L)
                if m2 and not rid:
                    rid = m2.group(1)
            is_reply = al.startswith("Réponse de")
            # For a reply with its own reply id, comment_id designates the parent;
            # otherwise comment_id is the element's own id and no parent is recorded.
            own_id = rid if (is_reply and rid) else cid
            parent_id = cid if (is_reply and rid) else None
            if not own_id:
                continue
            # The author name is taken from the aria-label; "?" when it does not match.
            if is_reply:
                mn = re.match(r'Réponse de (.+?) au commentaire de', al)
            else:
                mn = re.match(r'Commentaire de (.+?)(?: il y a .*| \d+\s?(?:min|h|j|sem|an).*)?$', al)
            author = mn.group(1).strip() if mn else "?"
            mt = re.search(r'il y a (.+)$', al)
            time_text = ("il y a " + mt.group(1)) if mt else ""
            # Profile = first link that is not a post / story link.
            prof = None
            for L in links:
                if "/posts/" in L or "story_fbid" in L:
                    continue
                prof = clean_profile(L)
                break
            is_page = (author == PAGE_NAME) or (prof and PAGE_FRAG in prof and "/posts/" not in prof)
            # Message: text of every div[dir="auto"] inside the element, minus the
            # author name, the page name, fragments rejected by _drop and duplicates.
            divs = d.execute_script(
                "return Array.from(arguments[0].querySelectorAll('div[dir=\"auto\"]'))"
                ".map(e=>e.textContent.trim()).filter(Boolean);", a)
            seen = set(); parts = []
            for t in divs:
                if t in (author, PAGE_NAME):
                    continue
                if _drop(t):
                    continue
                if t not in seen:
                    seen.add(t); parts.append(t)
            message = " ".join(parts).strip()
            # Photos: image sources containing "scontent", excluding the URLs that
            # carry one of the size markers below (presumably avatars/thumbnails —
            # TODO confirm); duplicates are ignored.
            photos = []
            for im in a.find_elements(By.CSS_SELECTOR, "img"):
                src = im.get_attribute("src") or ""
                if "scontent" in src and "_s." not in src and "/s148x148/" not in src \
                        and "/p148x148/" not in src and "/p50x50/" not in src:
                    if src not in photos:
                        photos.append(src)
            # At most six photo URLs are stored per comment.
            by_id[own_id] = {"id": own_id, "parent": parent_id, "is_reply": is_reply,
                             "author": author, "profile": prof,
                             "author_id": author_id_from(prof),
                             "is_page": bool(is_page), "message": message,
                             "time_text": time_text, "photos": photos[:6], "label": al}
        except Exception:
            # Best effort: an element that cannot be parsed is skipped.
            pass
    return by_id


def post_excerpt(d):
    """Return a short excerpt of the post taken from its og:description meta tag.

    The attribute value is HTML-escaped in the page source, so entities are
    decoded before the text is cut to 600 characters. Decoding first also
    prevents the cut from landing in the middle of an entity.

    Args:
        d: Object exposing the page HTML as ``page_source`` (Selenium driver).

    Returns:
        The decoded description, at most 600 characters long, or "" when the
        meta tag is not found.
    """
    import html  # local import so the block does not rely on file-level changes

    m = re.search(r'<meta property="og:description" content="([^"]*)"', d.page_source)
    if not m:
        return ""
    return html.unescape(m.group(1))[:600]


def download_photos(comment_id, urls):
    """Download the photos of one comment.

    Files are written to PHOTOS_DIR as ``<comment_id>_<index>.jpg``, where
    the index is the position of the URL in *urls*. A download is kept only
    when the response status is 200 and the body is larger than 800 bytes.

    Args:
        comment_id: Identifier used as the file-name prefix.
        urls: Iterable of image URLs; may be empty or None.

    Returns:
        List of relative paths ("photos/<file>") of the files actually saved.
    """
    if not urls:
        return []
    os.makedirs(PHOTOS_DIR, exist_ok=True)
    ck = {c["name"]: c["value"] for c in charger_cookies_fb()}
    out = []
    # The session is closed on exit so pooled connections are released.
    with requests.Session() as s:
        for k, v in ck.items():
            s.cookies.set(k, v, domain=".facebook.com")
        for i, u in enumerate(urls):
            try:
                r = s.get(u, headers={"User-Agent": UA}, timeout=30)
                if r.status_code != 200 or len(r.content) <= 800:
                    continue
                fn = "%s_%d%s" % (comment_id, i, ".jpg")
                # Context manager guarantees the file handle is flushed and closed.
                with open(os.path.join(PHOTOS_DIR, fn), "wb") as fh:
                    fh.write(r.content)
                out.append("photos/" + fn)
            except Exception:
                # Deliberately best-effort: one failed photo must not abort the others.
                continue
    return out


def set_all_comments(d):
    """Switch the comment ordering to 'Tous les commentaires' / 'All comments'.

    The sort button is the first role=button div mentioning "pertinent" /
    "Most relevant" whose visible text is non-empty and shorter than 40
    characters. After clicking it, the first menu item whose text contains
    the "all comments" label is clicked.

    Args:
        d: Selenium driver positioned on the post page.

    Returns:
        True when the menu entry was clicked, False when the button or the
        entry was not found or a click failed.
    """
    candidates = d.find_elements(By.XPATH, "//div[@role='button'][contains(.,'pertinent') or contains(.,'Pertinent') or contains(.,'Most relevant')]")
    btn = next(
        (el for el in candidates if 0 < len((el.text or "").strip()) < 40),
        None,
    )
    if not btn:
        return False
    try:
        d.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
        d.execute_script("arguments[0].click();", btn)
        time.sleep(1.6)
    except Exception:
        return False
    for option in d.find_elements(By.XPATH, "//*[@role='menuitem'] | //*[@role='menuitemradio']"):
        caption = option.text or ""
        if "Tous les commentaires" not in caption and "All comments" not in caption:
            continue
        # Only the first matching entry is tried; its outcome decides the result.
        try:
            d.execute_script("arguments[0].click();", option)
            time.sleep(3.5)
        except Exception:
            return False
        return True
    return False


def post_text(d):
    """Return the FULL text of the post.

    Up to four 'Voir plus' expanders are clicked first. The text returned is
    the longest div[dir="auto"] content located before the first comment or
    reply element in the document (or anywhere when there is none).

    Args:
        d: Selenium driver positioned on the post page.

    Returns:
        The stripped text, or "" when nothing is found or the script fails.
    """
    expanders = d.find_elements(By.XPATH, "//div[@role='article']//div[@role='button'][contains(.,'Voir plus')] | //span[text()='Voir plus']")
    for expander in expanders[:4]:
        try:
            d.execute_script("arguments[0].click();", expander)
            time.sleep(0.6)
        except Exception:
            # A failed expander click is ignored; the remaining ones are still tried.
            continue
    longest_block_js = """
    const arts=[...document.querySelectorAll('[aria-label^=\"Commentaire de\"],[aria-label^=\"Réponse de\"]')];
    const firstC=arts.length?arts[0]:null;
    const all=[...document.querySelectorAll('div[dir=\"auto\"]')];
    let best='';
    for(const e of all){
      if(firstC){const pos=firstC.compareDocumentPosition(e);
        if(!(pos & Node.DOCUMENT_POSITION_PRECEDING)) continue;}
      const t=e.textContent.trim();
      if(t.length>best.length) best=t;
    }
    return best;
    """
    try:
        found = d.execute_script(longest_block_js)
        return (found or "").strip()
    except Exception:
        return ""


def scrape_post(d, url):
    """Load a post and gather its excerpt and all of its comments.

    The URL is reduced to its bare permalink when possible, the page is
    loaded (followed by a 6-second wait), the post text is read, the comment
    ordering is switched, all threads are expanded and the comments parsed.

    Args:
        d: Selenium driver.
        url: URL of the post.

    Returns:
        A dict with the keys "url" (the URL actually loaded), "excerpt"
        (the post text when it has at least 30 characters, else the
        og:description excerpt), "tri_tous" (whether the ordering switch
        succeeded) and "comments" (the parsed comments by id).
    """
    canonical = bare_permalink(url)
    if canonical:
        url = canonical
    d.get(url)
    time.sleep(6)
    body = post_text(d)
    if len(body) >= 30:
        excerpt = body
    else:
        excerpt = post_excerpt(d)
    result = {"url": url, "excerpt": excerpt}
    result["tri_tous"] = set_all_comments(d)
    expand_all(d)
    result["comments"] = parse_comments(d)
    return result