from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
import time
import json
import urllib.parse
import os
import subprocess
import random


#data_dir = "C:/Users/Mech/Desktop/laragon-6.0.0/www/data"
data_dir = "/var/www/html/data"

# Flag shared with the route handler: stays True until a "sei=" request
# has been intercepted and rewritten during the current search.
bloque_detecte = {"val": True}

def take_google_screenshot_stealth(search_word, index):
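    """Run one Google search with stealth settings and save the results.

    Navigates to the results page through a request-rewriting route,
    dumps the page HTML and cookies to data_dir, and pads each run to a
    minimum wall-clock duration. The screenshot step that gave the
    function its name is currently commented out.
    """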

    start_time = time.time()  # Start the timer

    waittime = 8.0  # Minimum wall-clock time per search, in seconds

    search_encoded = urllib.parse.quote(search_word)
    search_url = f"https://www.google.com/search?q={search_encoded}&hl=fr"


    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--disable-extensions',
                '--disable-plugins',
                '--disable-default-apps',
                '--disable-background-timer-throttling',
                '--disable-renderer-backgrounding',
                '--disable-backgrounding-occluded-windows',
                '--disable-features=TranslateUI,VizDisplayCompositor',
                '--disable-ipc-flooding-protection',
                '--no-first-run',
                '--no-default-browser-check',
                '--disable-web-security',
                '--start-maximized',
                '--window-size=1920,1080'
            ]
        )


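        # Mimic a real French desktop Chrome profile: matching user agent,
        # viewport, locale, timezone, and Accept headers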
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
            viewport={'width': 1920, 'height': 1080},
            locale='fr-FR',
            timezone_id='Europe/Paris',
            extra_http_headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }
        )

        print("Page ")

        #if os.path.exists(os.path.join(data_dir, "cookies.json")) and (index + 1) % 5 != 0:
        #    with open(os.path.join(data_dir, "cookies.json"), "r") as f:
        #        cookies = json.load(f)
        #        context.add_cookies(cookies)
        #    print("Cookies restaur")

        page = context.new_page()

        # Patch common headless/automation fingerprints before any page script runs
        page.add_init_script("""
            // Hide the webdriver flag
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
            });

            // Remove automation traces
            delete navigator.__proto__.webdriver;

            Object.defineProperty(navigator, 'languages', {
                get: () => ['fr-FR', 'fr', 'en-US', 'en'],
            });

            Object.defineProperty(navigator, 'plugins', {
                get: () => [
                    {name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer'},
                    {name: 'Chromium PDF Plugin', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai'},
                    {name: 'Microsoft Edge PDF Plugin', filename: 'pdf'},
                    {name: 'PDF Viewer', filename: 'pdf'}
                ],
            });

            // Add a fake chrome runtime
            window.chrome = {
                runtime: {
                    onConnect: undefined,
                    onMessage: undefined,
                },
                loadTimes: function() {
                    return {
                        commitLoadTime: performance.now() - Math.random() * 1000,
                        finishDocumentLoadTime: performance.now() - Math.random() * 500,
                        finishLoadTime: performance.now() - Math.random() * 300,
                        firstPaintAfterLoadTime: performance.now() - Math.random() * 200,
                        firstPaintTime: performance.now() - Math.random() * 100,
                        navigationType: 'Other',
                        npnNegotiatedProtocol: 'h2',
                        requestTime: Date.now() / 1000 - Math.random() * 3,
                        startLoadTime: performance.now() - Math.random() * 1200,
                        wasAlternateProtocolAvailable: false,
                        wasFetchedViaSpdy: true,
                        wasNpnNegotiated: true
                    };
                }
            };

            // Spoof the permissions API so queries always resolve "granted"
            Object.defineProperty(navigator, 'permissions', {
                get: () => ({
                    query: () => Promise.resolve({ state: 'granted' }),
                }),
            });

            // Simulate realistic screen properties
            Object.defineProperty(screen, 'width', {
                get: () => 1920,
            });
            Object.defineProperty(screen, 'height', {
                get: () => 1080,
            });
            Object.defineProperty(screen, 'availWidth', {
                get: () => 1920,
            });
            Object.defineProperty(screen, 'availHeight', {
                get: () => 1040,
            });
            Object.defineProperty(screen, 'colorDepth', {
                get: () => 24,
            });
            Object.defineProperty(screen, 'pixelDepth', {
                get: () => 24,
            });

            // Mask common headless giveaways (CPU cores, device memory)
            Object.defineProperty(navigator, 'hardwareConcurrency', {
                get: () => 8,
            });

            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => 8,
            });


        """)


        print(f"[{index}] Recherche : {search_word}")

        try:

            def bloquer_ressources(route, request):
                url = request.url

                # Block heavy resources and Google telemetry/static endpoints
                if (
                    request.resource_type in ["image", "stylesheet", "font"]
                    or "https://www.google.com/gen_204" in request.url
                    or "https://www.google.com/xjs" in request.url
                    or "https://www.gstatic.com/" in request.url
                    or "https://csp.withgoogle.com/" in request.url
                    or ".webpkgcache.com" in request.url
                    #or "sei=" in request.url
                ):
                    route.abort()
                    return

                if "sei=" in url:

                    new_url = url + "&gl=FR&glp=1&start=0&num=50&complete=0&sa=N&pws=0&uule=w+CAIQIFISCQ-34gYfbuZHEWCUjGjDggsE&kgmid=/m/01"
                    bloque_detecte["val"] = False
                    print(f"URL modif :\nDe: {url}\n : {new_url}")
                    route.continue_(url=new_url)
                    return


                route.continue_()

            bloque_detecte["val"] = True

            page.route("**/*", bloquer_ressources)

            page.goto(search_url, wait_until="domcontentloaded", timeout=30000)

            if bloque_detecte["val"]:
                print("Blocking suspected: no 'sei=' request was intercepted; retrying with explicit parameters.")
                search_url = f"https://www.google.com/search?q={search_encoded}&hl=fr&gl=FR&glp=1&start=0&num=50&complete=0&sa=N&pws=0&uule=w+CAIQIFISCQ-34gYfbuZHEWCUjGjDggsE&kgmid=/m/01"  #&gsc=1&ucbcb=1
                page.goto(search_url, wait_until="domcontentloaded", timeout=30000)

            #page.goto(search_url, wait_until="domcontentloaded", timeout=30000)

            # Fire an extra, non-browser search via curl
            faire_requete_curl()
            #faire_requete_curl()

            #if (index + 1) % 5 == 0:
            #    faire_requete_curl()
            #    faire_requete_curl()
            #    waittime = 16


            # ✅ Accept the consent dialog if present
            try:
                page.click("button:has-text('Tout accepter')", timeout=5000)
                # "Afficher plus, Aperçu IA" targets the AI Overview expander
                page.click('[aria-label="Afficher plus, Aperçu\u00a0IA"]', timeout=5000)
                print("✅ Consent accepted.")
                time.sleep(1)
            except PlaywrightTimeoutError:
                print("ℹ️ No consent dialog detected.")

            try:
                page.wait_for_selector('div#search', timeout=5000)
                print("  > Results found")
            except PlaywrightTimeoutError:
                print("  > Timed out waiting for results")


            html_content = page.content()


            liens = extraire_liens_resultats_google(page)
            print("/n?? Resultats Google extraits :")
            for lien in liens:
                print(f"{lien['index']:02d} | Y: {lien['y']} px | {lien['url']}")

            if 'folsrch' in html_content:
                print("  > Mot-cle 'folsrch' detecte dans le HTML, pause supplementaire...")
                time.sleep(1.5)
                html_content = page.content()

            html_file_path = os.path.join(data_dir, f"html_result_{index}.html")
            with open(html_file_path, 'w', encoding='utf-8') as f:
                f.write(html_content)


            # Save cookies for potential reuse
            cookies = context.cookies()
            with open(os.path.join(data_dir, "cookies.json"), "w") as f:
                json.dump(cookies, f, indent=2)




            #screenshot_path = os.path.join(data_dir, f"screenshot_{index}_{search_word[:20]}.png")
            #page.screenshot(path=screenshot_path, full_page=True)

            #print(f"  > Screenshot enregistre : {screenshot_path}")


            elapsed = time.time() - start_time
            remaining = waittime - elapsed
            if remaining > 0:
                print(f"Waiting {remaining:.2f}s to reach the {waittime:.1f}s minimum")
                time.sleep(remaining)
            else:
                print(f"Already {elapsed:.2f}s elapsed (> {waittime:.1f}s), no wait")


        except Exception as e:
            print(f"  > Erreur lors de la recherche '{search_word}': {e}")

        finally:
            browser.close()


def faire_requete_curl():
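    """Issue a throwaway Google search for a random "<N> euros" query via
    curl, saving the response to data_dir/curl_result.html."""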
    nombre = random.randint(100, 9999)  # Adjust the bounds as needed
    url = f"https://www.google.com/search?q={nombre}+euros"

    headers = [
        "Accept-Language:fr",
        "Accept:text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Charset:UTF-8,*",
        "Accept-Encoding:gzip,deflate",
        "Cache-Control:max-age=0",
        "User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
    ]

    # Build the curl command
    curl_command = [
        "curl",
        "-s",
        "-k",
        "--max-time", "5",
        #"--proxy", "127.0.0.1:8888",
        url
    ]

    for header in headers:
        curl_command.extend(["-H", header])

    print(f"?? Requete CURL vers : {url}")
    try:
        response = subprocess.check_output(curl_command, stderr=subprocess.DEVNULL).decode('utf-8', errors='ignore')
        print("? Requete CURL terminee (contenu non affiche)")
        curl_result_path = os.path.join(data_dir, "curl_result.html")
        with open(curl_result_path, "w", encoding="utf-8") as f:
            f.write(response)
    except subprocess.CalledProcessError as e:
        print(f"? echec de la requete CURL : {e}")


def extraire_liens_resultats_google(page):
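    """Extract the organic result links (div.tF2Cxc blocks) together with
    their vertical position on the page."""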
    try:
        # Wait for the results container to load
        page.wait_for_selector("div#search", timeout=10000)
    except PlaywrightTimeoutError:
        print("Search results not found")
        return []

    # Common selector for organic result blocks
    result_blocks = page.query_selector_all("div.tF2Cxc")  # main block of one result

    liens_info = []

    for i, bloc in enumerate(result_blocks):
        lien = bloc.query_selector("a")

        if lien:
            href = lien.get_attribute("href")
            text = lien.inner_text().strip()
            box = lien.bounding_box()

            if href and box:
                liens_info.append({
                    "index": i + 1,
                    "y": int(box["y"]),
                    "text": text[:100],
                    "url": href
                })

    return liens_info


def main():
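    """Read the search terms from data_dir/mots.txt and process them one by one."""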
    mots_file = os.path.join(data_dir, "mots.txt")

    if not os.path.exists(mots_file):
        print(f"Fichier non trouve : {mots_file}")
        return

    with open(mots_file, 'r', encoding='utf-8') as f:
        mots = [line.strip() for line in f if line.strip()]

    print(f"{len(mots)} mots a traiter...")

    for index, mot in enumerate(mots, start=1):

        take_google_screenshot_stealth(mot, index)

        #time.sleep(2)  # 2-second pause between searches
        #faire_requete_curl()
        #faire_requete_curl()

        #if (index + 1) % 6000000 == 0:
            #faire_requete_curl()



if __name__ == "__main__":
    main()

