Coding Showcase

Mostly in Python, with some R and Bash shell scripts too. (**Disclaimer**: use at your own risk!)

(code propre)_RI_tp1_6avril25.ipynb

This Python script sets up a complete information-retrieval environment: it installs the system and Python dependencies, extracts and preprocesses the documents, builds Pyserini (Lucene) indexes, runs queries (BM25, TF-IDF, RM3), and evaluates performance with TREC measures.

Keywords: information retrieval, Pyserini, indexing, text preprocessing, performance evaluation

# =========================================
# ===  Étape 0: Configuration Complète  ===
# =========================================

# === Cellule 0: Configuration Complète (Tout-en-un) ===
# Installe Java 21, configure comme défaut, installe outils build,
# pybind11, dernière Pyserini, NLTK+ressources, définit chemins, fonctions, parse topics.

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q"
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab'] # Liste corrigée
for resource in nltk_resources:
    try:
        # punkt/punkt_tab live under tokenizers/, the other resources under corpora/
        resource_path = (f'tokenizers/{resource}.zip'
                         if resource in ('punkt', 'punkt_tab')
                         else f'corpora/{resource}.zip')
        nltk.data.find(resource_path)
    except LookupError: # Gestion d'erreur corrigée
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive (corrigé)
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    """Applique la tokenisation, la mise en minuscule, la suppression de la ponctuation, la suppression des stop words et la lemmatisation."""
    if not isinstance(text, str): return ""
    try:
        tokens = word_tokenize(text.lower())
    except LookupError as e_tok:
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              resource_name = str(e_tok).split('Resource ')[1].split(' ')[0]
              print(f"--- Tokenizer a besoin de '{resource_name}', tentative de téléchargement ---")
              try:
                  nltk.download(resource_name, quiet=True)
                  print(f"--- Ressource '{resource_name}' téléchargée, nouvelle tentative de tokenisation ---")
                  tokens = word_tokenize(text.lower())
              except Exception as e_dl_tok:
                  print(f"--- Échec du téléchargement de '{resource_name}': {e_dl_tok} ---")
                  raise e_tok
         else: raise e_tok
    except Exception as e_tok_other:
         print(f"Erreur inattendue dans word_tokenize: {e_tok_other}")
         raise e_tok_other
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")
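# Illustrative sanity check (hedged example): the pipeline lowercases, drops punctuation
# and stop words ("the", "are"), and lemmatizes, so a call such as
#   preprocess_text("The Information Retrieval systems are running quickly")
# should return something like "information retrieval system running quickly".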

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
import re
import glob
def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title: topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError: print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic: print(f"  ATTENTION: Erreur parsing {file_path}: {e_topic}")
    return topics
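# For reference, the regexes above expect the classic TREC topic layout, roughly
# (topic number and wording are illustrative):
#   <top>
#   <num> Number: 051
#   <title> Airbus Subsidies
#   <desc> Description:
#   Document will discuss ...
#   </top>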

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else: topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
all_topics = {}
if not topic_files: print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    print(f"  Parsing des fichiers topics: {topic_files}")
    for tf in topic_files: all_topics.update(parse_topics(tf))

try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés.")
    print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes...")
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries:
     print(f"\nERREUR lors du prétraitement des requêtes: {e_preproc_queries}")
     queries_short_preprocessed, queries_long_preprocessed = {}, {}

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout: print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else: print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e: print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check: print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")

# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Module pour décompresser
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)
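# For reference (illustrative values, not taken from the archive): each AP SGML file holds
# blocks of the form
#   <DOC>
#   <DOCNO> AP880212-0001 </DOCNO>
#   <TEXT> Some article text ... </TEXT>
#   </DOC>
# which the loop below converts into one JSONL line per document, e.g.
#   {"id": "AP880212-0001", "contents": "Some article text ..."}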

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise FileNotFoundError
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")



# --- Nouvelle Cellule ---

# === Cellule 2: Indexation Baseline ===
# Crée l'index Lucene à partir de ap_docs.jsonl (sans prétraitement spécifique).

import os
import subprocess
import traceback

# Vérifier que les chemins sont définis
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Début de l'indexation Baseline...")
print(f"Dossier source: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide.")

# Commande Pyserini
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4",
    "--storePositions", "--storeDocvectors", "--storeRaw"
]

print(f"Exécution: {' '.join(index_cmd_baseline)}")
try:
    # Exécuter la commande d'indexation
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:]) # Afficher la fin de stdout
    print("Sortie STDERR:\n", result.stderr)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except Exception as e:
    # Gérer les erreurs potentielles
    print(f"\nERREUR pendant l'indexation Baseline: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index créé
print(f"\nVérification taille index: {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")
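# Optional sanity check (a hedged sketch, assuming the IndexReader API of recent Pyserini
# releases): print a few statistics of the freshly built baseline index.
try:
    from pyserini.index.lucene import IndexReader
    reader = IndexReader(INDEX_DIR_BASELINE)
    print("Baseline index stats:", reader.stats())  # e.g. number of documents and terms
except Exception as e_stats:
    print(f"IndexReader check skipped: {e_stats}")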


# --- Nouvelle Cellule ---

# === Cellule 3: Préparer les Données Prétraitées ===
# Lit ap_docs.jsonl, applique la fonction preprocess_text (lemmatisation, etc.)
# et écrit le résultat dans ap_docs_preprocessed.jsonl.

import json
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins et la fonction sont définis
try:
    CORPUS_DIR
    JSONL_OUTPUT_PATH # Défini dans la config, utilisé comme entrée ici
    preprocess_text # Défini dans la config
except NameError:
    print("ERREUR: Variables/Fonction non définies. Exécutez la cellule de configuration complète.")
    raise

# Write the preprocessed corpus into its own directory: if both JSONL files sat in
# CORPUS_DIR, the "preprocessed" index built later would ingest the raw file as well.
CORPUS_PREPROC_DIR = CORPUS_DIR + "_preprocessed"
os.makedirs(CORPUS_PREPROC_DIR, exist_ok=True)
JSONL_PREPROC_PATH = os.path.join(CORPUS_PREPROC_DIR, "ap_docs_preprocessed.jsonl") # Fichier de sortie

print(f"Préparation données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# Vérifier fichier source
if not os.path.exists(JSONL_OUTPUT_PATH) or os.path.getsize(JSONL_OUTPUT_PATH) == 0:
     raise FileNotFoundError(f"Le fichier source {JSONL_OUTPUT_PATH} est manquant ou vide.")

doc_count_preproc = 0
error_count = 0
try:
    # Ouvrir les fichiers d'entrée et de sortie
    with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
         open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:
        # Boucler sur chaque ligne du fichier d'entrée
        for line in tqdm(infile, desc="Prétraitement des documents"):
            try:
                data = json.loads(line)
                doc_id = data.get('id', None)
                original_contents = data.get('contents', '')
                if doc_id is None: error_count += 1; continue

                # Appliquer le prétraitement
                preprocessed_contents = preprocess_text(original_contents)

                # Écrire la ligne traitée
                json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                outfile.write(json_line + '\n')
                doc_count_preproc += 1
            except json.JSONDecodeError: error_count += 1 # Compter les erreurs JSON
            except Exception as e_line: print(f"\nErreur ligne (id={data.get('id', 'inconnu')}): {e_line}"); error_count += 1

    # Afficher le résumé
    print(f"\nTerminé.")
    print(f"  {doc_count_preproc} documents prétraités écrits dans {JSONL_PREPROC_PATH}")
    if error_count > 0: print(f"  {error_count} lignes ignorées.")

    # Vérifier la taille du fichier de sortie
    if os.path.exists(JSONL_PREPROC_PATH):
        output_size = os.path.getsize(JSONL_PREPROC_PATH)
        print(f"  Taille finale: {output_size} octets.")
        if output_size == 0 and doc_count_preproc > 0: print("  ATTENTION: Taille nulle ?!")
    else: print(f"  ATTENTION: Fichier sortie {JSONL_PREPROC_PATH} non créé.")

except Exception as e_main:
    print(f"ERREUR générale préparation données prétraitées: {e_main}")
    traceback.print_exc()
    raise


# --- Nouvelle Cellule ---

# === Cellule 4: Indexation Prétraitée ===
# Crée l'index Lucene à partir de ap_docs_preprocessed.jsonl.
# Utilise l'option --pretokenized car le texte est déjà traité.

import os
import subprocess
import traceback

# Vérifier que les chemins sont définis
try:
    CORPUS_DIR
    INDEX_DIR_PREPROC
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

CORPUS_PREPROC_DIR = CORPUS_DIR + "_preprocessed" # directory created in Cell 3
JSONL_PREPROC_PATH = os.path.join(CORPUS_PREPROC_DIR, "ap_docs_preprocessed.jsonl") # Fichier source

print(f"Début de l'indexation Prétraitée...")
print(f"Collection source (dossier): {CORPUS_PREPROC_DIR}")
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe et n'est pas vide
if not os.path.exists(JSONL_PREPROC_PATH) or os.path.getsize(JSONL_PREPROC_PATH) == 0:
    raise FileNotFoundError(f"Le fichier données prétraitées {JSONL_PREPROC_PATH} est manquant ou vide.")

# Commande Pyserini
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_PREPROC_DIR,
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4",
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Option clé ici
]

print(f"Exécution: {' '.join(index_cmd_preproc)}")
try:
    # Exécuter la commande d'indexation
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:])
    print("Sortie STDERR:\n", result.stderr)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
        print(f"\nIndexation Prétraitée terminée. Index créé dans {INDEX_DIR_PREPROC}")
except Exception as e:
    # Gérer les erreurs
    print(f"\nERREUR pendant l'indexation Prétraitée: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index
print(f"\nVérification taille index: {INDEX_DIR_PREPROC}...")
if os.path.exists(INDEX_DIR_PREPROC):
    du_cmd = f"du -sh '{INDEX_DIR_PREPROC}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")



# --- Nouvelle Cellule ---

# === Cellule 5: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Lance les 8 combinaisons de recherche et sauvegarde les résultats.
# Assurez-vous que l'environnement Java 21 est toujours actif.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import autoclass, JavaException # Pour TF-IDF

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Charger ClassicSimilarity
try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity'); print("ClassicSimilarity chargée.")
except Exception as e: print(f"ERREUR chargement ClassicSimilarity: {e}"); ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise

# Fonction de recherche (inchangée par rapport à la version précédente fonctionnelle)
def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")
    all_results_list = []
    searcher = None
    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        if model == 'bm25': print("  Config BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None: print("ERREUR: ClassicSimilarity non chargée. ABANDON."); return
            print("  Config ClassicSimilarity (TF-IDF)...")
            try: searcher.set_similarity(ClassicSimilarity()); print("  ClassicSimilarity configurée.")
            except Exception as e_sim: print(f"ERREUR config ClassicSimilarity: {e_sim}"); return
        else: print(f"Modèle '{model}' non reconnu, utilise BM25."); searcher.set_bm25()

        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        if all_results_list:
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites.")
        else: print("\n  Avertissement: Aucun résultat généré.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")
        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations ---
print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt"); perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt"); perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt"); perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt"); perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")
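# For reference: each run file written above follows the standard 6-column TREC run format
#   <query_id> Q0 <doc_id> <rank> <score> <run_tag>
# e.g. an illustrative line:
#   051 Q0 AP880212-0001 1 12.345600 baseline_short_bm25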



# --- Nouvelle Cellule ---

# === Cellule 6: Évaluation des Runs (Partie 1 et/ou Finale) ===
# Lit les fichiers Qrels, lit les fichiers de résultats (.txt) du dossier RUN_DIR,
# calcule MAP et P@10, et affiche/sauvegarde les tableaux récapitulatifs.
# Peut être exécutée après l'étape 5 (Recherche) et ré-exécutée après l'étape 7 (RM3).
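# Reminder of the two measures computed below:
#   P@10 : fraction of relevant documents among the top 10 retrieved for a query.
#   MAP  : mean over all queries of average precision, where average precision is the
#          precision at the rank of each relevant retrieved document, averaged over
#          that query's relevant documents.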

import pandas as pd
import glob
import pytrec_eval
import os
import traceback

# Vérifier que les chemins sont définis
try:
    QRELS_DIR
    RUN_DIR
    EVAL_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Préparation des Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files: print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}."); qrels_dict = {}
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")
    all_qrels_data = []
    for qf in qrels_files:
        try:
            qrels_df = pd.read_csv(qf, sep='\s+', names=['query_id', 'unused', 'doc_id', 'relevance'], dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
            all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']])
        except Exception as e: print(f"Erreur lecture Qrels {qf}: {e}")
    if not all_qrels_data: print("ERREUR: Impossible lire données Qrels."); qrels_dict = {}
    else:
        combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)
        qrels_dict = {}
        for _, row in combined_qrels_df.iterrows():
            qid, did, rel = row['query_id'], row['doc_id'], int(row['relevance'])
            if rel < 0: continue
            if qid not in qrels_dict: qrels_dict[qid] = {}
            qrels_dict[qid][did] = rel
        print(f"Total {len(qrels_dict)} requêtes avec jugements chargées.")

# --- Évaluation des Runs ---
if not qrels_dict: print("\nAucun jugement de pertinence chargé, impossible d'évaluer.")
else:
    measures = {'map', 'P_10'}
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures)
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt"))) # Prend tous les .txt dans RUN_DIR
    print(f"\n{len(run_files)} fichiers de run à évaluer trouvés dans {RUN_DIR}.")

    results_summary = []
    if not run_files: print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}.")
    else:
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            print(f"\n--- Évaluation: {run_name} ---")
            run_dict = {}
            error_count = 0
            try:
                with open(run_file, 'r', encoding='utf-8') as f_run:
                    for line in f_run:
                        parts = line.strip().split();
                        if len(parts) != 6: error_count += 1; continue
                        qid, _, did, _, score, _ = parts
                        try: score = float(score)
                        except ValueError: error_count += 1; continue
                        if qid not in run_dict: run_dict[qid] = {}
                        run_dict[qid][did] = score
                if error_count > 0: print(f"  Avertissement: {error_count} lignes mal formatées ignorées.")

                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_q = len(run_dict) - len(filtered_run_dict)
                if ignored_q > 0: print(f"  Avertissement: {ignored_q} requêtes run ignorées (absentes Qrels).")
                if not filtered_run_dict: print("  Erreur: Aucune requête ne correspond aux Qrels."); continue

                eval_results = evaluator.evaluate(filtered_run_dict)
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")

                # Extraire infos pour résumé
                parts = run_name.replace('.txt','').split('_')
                if len(parts) >= 3:
                    index_type, query_type, model_type = parts[0], parts[1], parts[2]
                    if len(parts) > 3 and parts[3] == 'rm3': model_type += "+RM3" # Gérer RM3
                    results_summary.append({
                        "Run Name": run_name, "Index": index_type,
                        "Query Type": query_type.capitalize(),
                        "Weighting Scheme": model_type.upper(),
                        "MAP": avg_map, "P@10": avg_p10
                    })
                else: print(f"  Avertissement: Impossible parser nom run '{run_name}'.")

            except FileNotFoundError: print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e: print(f"  Erreur évaluation {run_name}: {e}"); traceback.print_exc()

        # Afficher et sauvegarder résumé
        if results_summary:
            print("\n\n=== Tableau Récapitulatif des Résultats ===")
            results_df = pd.DataFrame(results_summary)
            results_df = results_df.sort_values(by=["Index", "Query Type", "Weighting Scheme"]) # Trier

            try: # Afficher Tableaux Pivots
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP ---"); print(pivot_map.to_markdown(floatfmt=".4f"))
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 ---"); print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\nErreur création tableaux pivots: {e_pivot}")

            # Sauvegarder le DataFrame complet
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_final.csv") # Nom final
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif complet sauvegardé: {summary_file_path}")
            except Exception as e_save: print(f"\nErreur sauvegarde résumé: {e_save}")
        else: print("\nAucun résultat d'évaluation à afficher.")


# --- Nouvelle Cellule ---

# === Cellule 7: Exécuter la Recherche Améliorée (RM3) ===
# Applique RM3 sur la meilleure configuration de base identifiée à l'étape 6.
# !! N'OUBLIEZ PAS DE CONFIGURER LES VARIABLES BEST_... CI-DESSOUS !!

from pyserini.search.lucene import LuceneSearcher
from jnius import autoclass, JavaException
from tqdm.notebook import tqdm
import time
import traceback
import os

# Recharger ClassicSimilarity au cas où
try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
except Exception: ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; EVAL_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise

# --- À CONFIGURER selon vos meilleurs résultats de l'Étape 6 ---
# !! MODIFIEZ CECI EN FONCTION DE VOS RÉSULTATS D'ÉVALUATION !!
print("--- Configuration RM3 ---")
print("Veuillez éditer les variables BEST_... ci-dessous en fonction de vos meilleurs résultats MAP de l'étape précédente.")
# Exemple: si preproc + long + bm25 était le meilleur
BEST_INDEX_PATH = INDEX_DIR_PREPROC           # Ex: INDEX_DIR_BASELINE ou INDEX_DIR_PREPROC
BEST_QUERIES = queries_long_preprocessed      # Ex: queries_short, queries_long, ..._preprocessed
BEST_MODEL_BASE = 'bm25'                      # Ex: 'bm25' ou 'tfidf'
BEST_RUN_TAG_PREFIX = "preproc_long"          # Ex: 'baseline_short', 'preproc_long'
USE_PREPROC_QUERY_FOR_RM3 = False             # Généralement False si BEST_QUERIES est déjà prétraité
# ----------------------------------------------------------------
print(f"Configuration choisie pour RM3:")
print(f"  Index: {os.path.basename(BEST_INDEX_PATH)}")
# print(f"  Requêtes: (variable BEST_QUERIES)") # Difficile d'afficher le nom de la variable
print(f"  Modèle Base: {BEST_MODEL_BASE}")
print(f"  Préfixe Tag: {BEST_RUN_TAG_PREFIX}")
print(f"  Utiliser Preproc Requête?: {USE_PREPROC_QUERY_FOR_RM3}")

# Nom du fichier et tag pour le run RM3
PRF_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3.txt")
RM3_RUN_TAG = f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3"

# Paramètres RM3
rm3_config = {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5}
print(f"  Paramètres RM3: {rm3_config}")
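# Meaning of the RM3 parameters passed to searcher.set_rm3() below:
#   fb_docs: number of top-ranked feedback documents used to build the relevance model,
#   fb_terms: number of expansion terms added to the original query,
#   original_query_weight: interpolation weight kept on the original query (0.5 = equal mix).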

# --- Fonction de recherche RM3 (séquentielle) ---
# (Définition identique à celle de search_code_final, on peut la réutiliser si elle est dans la portée)
# Par sécurité, on la redéfinit ici au cas où l'utilisateur n'exécute que cette cellule après setup.
def perform_search_sequential_rm3(queries, index_path, model_base, k, output_run_file, run_tag, use_preprocessed_query=False, rm3_params=None):
    start_time = time.time()
    print(f"\nDébut recherche SÉQUENTIELLE RM3: Modèle='{model_base}+RM3', Tag='{run_tag}', k={k}")
    all_results_list = []
    searcher = None
    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        if model_base == 'bm25': print("  Config BM25 (base)..."); searcher.set_bm25(k1=0.9, b=0.4)
        elif model_base == 'tfidf':
            if ClassicSimilarity is None: raise ValueError("ClassicSimilarity non chargée.")
            print("  Config ClassicSimilarity (base)...")
            try: searcher.set_similarity(ClassicSimilarity())
            except Exception as e_sim: print(f"ERREUR config ClassicSimilarity: {e_sim}"); return
        else: print(f"Modèle base '{model_base}' non reconnu, utilise BM25."); searcher.set_bm25()
        print("  Activation RM3..."); searcher.set_rm3(**rm3_params); print("  RM3 activé.")
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche RM3 QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche RM3...")
        if all_results_list:
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites.")
        else: print("\n  Avertissement: Aucun résultat RM3 généré.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")
        end_time = time.time()
        print(f"Recherche RM3 terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run RM3 {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# Lancer la recherche RM3 (après configuration des variables BEST_...)
print("\nLancement de la recherche RM3...")
perform_search_sequential_rm3(
    BEST_QUERIES, BEST_INDEX_PATH, BEST_MODEL_BASE, K_RESULTS,
    PRF_RUN_FILE, RM3_RUN_TAG,
    use_preprocessed_query=USE_PREPROC_QUERY_FOR_RM3, rm3_params=rm3_config
)

print("\n--- Exécution de la recherche RM3 terminée. ---")


# --- Nouvelle Cellule ---

To cite this code:

Loyer, Dominique. (2024). (code propre)_RI_tp1_6avril25.ipynb [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

1mars2018.R

This R code loads, explores, and cleans census-style data, then builds and evaluates predictive models (a decision tree and a neural network) to classify income groups.

Keywords: data, cleaning, modelling, prediction, R

#Technology: I used R for the analysis (here is the code) 

##Auteur: Dominique Loyer


#Question 1 and 2 and exploration of data

Census <- read.csv("~/Desktop/CensusClean.csv")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- readxl::read_excel("Census.xlsx")
pwc
View(pwc)
str(pwc)
head(pwc)
subset(pwc, Education %in% "Masters")
# combined conditions need R's & and | operators (AND/OR are not valid R syntax)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
subset(pwc, Education %in% c("Masters", "Doctorate"))
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
# the income levels are stored as the strings ">50K" and ">50K." (note the trailing dot)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
nrow(master50kplus) + nrow(master50kplusDot)
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)


#Question 3

##Cleaningmytreeadult
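# Note (assumption): pwcClean is taken to be a cleaned copy of the census data created
# earlier in the session (e.g. pwcClean <- as.data.frame(pwc)); its creation is not shown
# in this transcript.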

###categorical variables as factor
pwcClean$workclass <- as.factor(pwcClean$workclass)
pwcClean$`Income Group` <- as.factor(pwcClean$`Income Group`)
pwcClean$Education <- as.factor(pwcClean$Education)
pwcClean$`Occupation Status` <- as.factor(pwcClean$`Occupation Status`)
pwcClean$Relationship <- as.factor(pwcClean$Relationship)
pwcClean$Gender <- as.factor(pwcClean$Gender)
pwcClean$`Native country` <- as.factor(pwcClean$`Native country`)
summary(pwcClean)
pwcClean$`Marital Status` <- as.factor(pwcClean$`Marital Status`)
summary(pwcClean)
pwcClean$Age=(pwcClean$Age-min(pwcClean$Age))/(max(pwcClean$Age)-min(pwcClean$Age))

range(pwcClean$`Demographic Adjustment`)
hist(pwcClean$`Demographic Adjustment`)

###replacing missing values and merging Income Group (50k and 50k.)

install.packages("tidyr")
library("tidyr")
pwc$`Occupation Status` <- sub("?", "Other-service", pwc$`Occupation Status`, fixed = TRUE) # fixed = TRUE so "?" is matched literally, not as a regex
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
pwc$workclass[pwc$workclass==" ?"] = as.character(sample(pwc$workclass[which(pwc$workclass !=" ?")], 1774, replace = FALSE))
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50less <- subset (pwc, `Income Group` %in% "<=50K")



###normalization with Min-Max
pwcClean$`Demographic Adjustment`=(pwcClean$`Demographic Adjustment`-min(pwcClean$`Demographic Adjustment`))/(max(pwcClean$`Demographic Adjustment`)-min(pwcClean$`Demographic Adjustment`))
hist(pwcClean$`Demographic Adjustment`)
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education`))
pwcClean$`capital-gain`=(pwcClean$`capital-gain`-min(pwcClean$`capital-gain`))/(max(pwcClean$`capital-gain`)-min(pwcClean$`capital-gain`))
pwcClean$`capital-loss`=(pwcClean$`capital-loss`-min(pwcClean$`capital-loss`))/(max(pwcClean$`capital-loss`)-min(pwcClean$`capital-loss`))
pwcClean$`hours-per-week`=(pwcClean$`hours-per-week`-min(pwcClean$`hours-per-week`))/(max(pwcClean$`hours-per-week`)-min(pwcClean$`hours-per-week`))
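# A small refactoring sketch (not in the original transcript): the repeated min-max
# expressions above can be wrapped in a helper and applied to each numeric column, e.g.
min_max <- function(x) (x - min(x)) / (max(x) - min(x))
# pwcClean$Age <- min_max(pwcClean$Age)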

###Normalized and cleaned data
pwcClean.normalized <- pwcClean
pwcClean.normalizedCopy <- pwcClean.normalized



#Decision Tree
##pruning the tree with rpart
install.packages("rpart")
library(rpart)
library(rpart.plot)
mytreeadult=rpart(`Income Group`~Education+workclass+`Occupation Status`+Gender+Age+`hours-per-week`
                  , data=pwcClean.normalized, method="class", control=rpart.control(minsplit=1))
mytreeadult
plot(mytreeadult)
rpart.plot(mytreeadult, type=3, extra=101, fallen.leaves = TRUE)
text(mytreeadult, use.n=T, all=T, pretty=0, cex=0.9, xpd=TRUE)
estincome.class=predict(mytreeadult, newdata=pwcClean.normalized, type="class")

##cross-validation table

t1=table(`Income Group`, estincome.class)
t1
(10061+1354+11564+1505)/30511
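# Equivalent way to read overall accuracy off the confusion table (assuming the income
# levels were merged so that t1 is square):
sum(diag(t1)) / sum(t1)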



###Training data: a random sample of 20000 of the 30511 rows (used to train the models)
index <- sample(1:nrow(pwcClean.normalized), 20000)
pwc.train = pwcClean.normalized[index,]
pwc.valid = pwcClean.normalized[-index,]

###validation data: predict the decision tree on the validation set, then tabulate
pwc.valid$est.income <- predict(mytreeadult, newdata = pwc.valid, type = "class")
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc
(7457+1519)/(30511-20000)

#Neural network with 10 hidden units (nnet fits a single hidden layer; size = 10)

require(nnet)
library(nnet)
set.seed(99999999)

pwc.net = nnet(`Income Group`~., data=pwc.train, size=10)
pwc.valid$est.income = predict(pwc.net,pwc.valid,type="class")
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc


# rm(list = ls())  # presumably intended to clear the workspace; clear() is not an R function

### general rpart call template
# rpart(formula, data=, method=, control=)
fit<-rpart(Default_On_Payment~Status_Checking_Acc+Duration_in_Months+Credit_History+Purposre_Credit_Taken+
              Credit_Amount+Savings_Acc+Years_At_Present_Employment+Inst_Rt_Income+Marital_Status_Gender+
              Other_Debtors_Guarantors+Current_Address_Yrs+Property+Age+Other_Inst_Plans+Housing+
              Num_CC+Job+Dependents+Telephone+Foreign_Worker,
             data=cust_data, method="class",
           control=rpart.control(minsplit=20, cp=0.01))











To cite this code:

Loyer, Dominique. (2024). 1mars2018.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

ARIMA.R

This R code imports sales data, converts it into a time series, decomposes it into seasonal, trend, and remainder components, then fits an automatic ARIMA model to generate and plot forecasts.

Keywords: time series, forecasting, ARIMA, decomposition, analysis

#Setting working directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Time_Series_Accelerator")

library("forecast")

#Reading the data
sales <- read.csv("sales.csv")

#What kind of data we have
sales
head(sales)
tail(sales)

#Create a time series from the input data: [,1] selects the first column (all rows); freq = 12 means monthly data (use freq = 4 for quarterly)
sales <- ts(sales[,1],start=1995,freq=12)

write.csv(sales,"newsales.csv")

#Let's view, what is the output
sales

#Plot Time Series
plot(sales)

#Divides the series into seasonal, trend and remainder components. (Note: the s.window argument, which controls how rapidly the seasonal component can change, belongs to stl(), not decompose(); see the stl() sketch below.)

apts <- decompose(sales)

plot (apts)
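# Alternative decomposition sketch: stl() is the function whose s.window argument controls
# how rapidly the seasonal component can change ("periodic" = a constant seasonal pattern).
apts_stl <- stl(sales, s.window = "periodic")
plot(apts_stl)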

# Select Best Model

arima_model <- auto.arima(sales)

arima_model

error_estimate_ARIMA <- accuracy(arima_model)

forecast(arima_model, 20)

plot(forecast(arima_model,20))

# ME: Mean Error
# RMSE: Root Mean Squared Error
# MAE: Mean Absolute Error
# MPE: Mean Percentage Error
# MAPE: Mean Absolute Percentage Error
# MASE: Mean Absolute Scaled Error
# ACF1: Autocorrelation of errors at lag 1.
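# A hedged sketch (not in the original script): hold out the last 12 months to check
# forecast accuracy on data the model has not seen.
n <- length(sales)
split_time <- time(sales)[n - 12]
train <- window(sales, end = split_time)
test <- window(sales, start = split_time + 1/12)
fit_train <- auto.arima(train)
accuracy(forecast(fit_train, h = 12), test)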


To cite this code:

Loyer, Dominique. (2024). ARIMA.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

backend_7juin25.html

An interactive web interface that lets users submit a URL or a piece of text, have its credibility level analysed by a backend service, and see the result displayed through a gauge and detailed findings.

Keywords: credibility, information, evaluation, web interface, frontend

<!DOCTYPE html>
<html lang="fr">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Évaluation de Crédibilité</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <!-- Le script pour les icônes sera déplacé à la fin du body pour un chargement optimal -->
    <style>
        /* Style personnalisé pour la jauge */
        .gauge-container {
            width: 200px;
            height: 100px;
            position: relative;
            overflow: hidden;
            border-radius: 100px 100px 0 0;
        }
        .gauge-background {
            width: 100%;
            height: 100%;
            background: linear-gradient(to right, #ef4444, #eab308, #22c55e); /* Red-Yellow-Green */
            position: absolute;
            top: 0;
            left: 0;
        }
        .gauge-mask {
            width: 100%;
            height: 100%;
            background-color: #f3f4f6; /* bg-gray-100 */
            position: absolute;
            top: 0;
            left: 0;
            transform-origin: bottom center;
            /* La rotation sera ajustée par JS */
            transition: transform 0.5s ease-in-out;
        }
        .gauge-center {
            width: 160px;
            height: 80px;
            background-color: #f3f4f6; /* bg-gray-100 */
            position: absolute;
            bottom: 0;
            left: 20px;
            border-radius: 80px 80px 0 0;
            display: flex;
            flex-direction: column;
            justify-content: flex-end;
            align-items: center;
            padding-bottom: 5px;
        }
        .gauge-score {
            font-size: 1.5rem; /* text-2xl */
            font-weight: bold;
        }
        .gauge-label {
            font-size: 0.75rem; /* text-xs */
            color: #6b7280; /* text-gray-500 */
        }
        /* Style pour surligner les mots (explication LIME) */
        .highlight-positive { background-color: rgba(34, 197, 94, 0.3); padding: 0 2px; border-radius: 3px; }
        .highlight-negative { background-color: rgba(239, 68, 68, 0.3); padding: 0 2px; border-radius: 3px; }

        /* Style pour les barres simples */
        .bar-container {
            height: 10px;
            background-color: #e5e7eb; /* bg-gray-200 */
            border-radius: 5px;
            overflow: hidden;
            width: 100px; /* Ajuster si nécessaire */
        }
        .bar {
            height: 100%;
            border-radius: 5px;
            transition: width 0.5s ease-in-out;
        }
    </style>
</head>
<body class="bg-gray-100 font-sans p-4 md:p-8">

    <div class="container mx-auto max-w-3xl bg-white shadow-lg rounded-lg p-6 md:p-8">

        <h1 class="text-2xl md:text-3xl font-bold text-center text-gray-800 mb-6">
            Système d'Évaluation de la Crédibilité de l'Information
        </h1>

        <div class="mb-6">
            <label for="inputData" class="block text-sm font-medium text-gray-700 mb-2">
                Entrez une URL ou collez du texte :
            </label>
            <textarea id="inputData" rows="4" class="w-full p-3 border border-gray-300 rounded-md focus:ring-2 focus:ring-indigo-500 focus:border-indigo-500 transition duration-150 ease-in-out" placeholder="Ex: https://www.example.com ou 'Ce texte semble suspect...'"></textarea>
            <button id="verifyButton" class="mt-3 w-full inline-flex justify-center items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500 transition duration-150 ease-in-out disabled:opacity-50">
                <i data-lucide="search" class="mr-2 h-5 w-5"></i> Vérifier la Crédibilité
            </button>
        </div>

        <div id="reportSection" class="hidden mt-8 border-t border-gray-200 pt-6">
            <h2 class="text-xl md:text-2xl font-semibold text-gray-800 mb-4 text-center">Rapport d'Analyse</h2>

            <div class="flex flex-col items-center mb-6 p-4 bg-gray-50 rounded-lg">
                <h3 class="text-lg font-medium text-gray-700 mb-2">Score de Crédibilité Global</h3>
                <div class="gauge-container mb-2">
                    <div class="gauge-background"></div>
                    <div id="gaugeMask" class="gauge-mask"></div>
                    <div class="gauge-center">
                        <span id="gaugeScore" class="gauge-score">--</span>
                        <span class="gauge-label">0 = Faible, 10 = Élevé</span>
                    </div>
                </div>
                <p id="reportSummary" class="text-center text-gray-600 text-sm italic"></p>
            </div>
             
             <!-- Section des constats (findings) -->
            <div id="findingsSection" class="bg-gray-50 p-4 rounded-lg shadow-sm">
                 <h4 class="font-semibold text-gray-700 mb-3 border-b pb-2 flex items-center">
                    <i data-lucide="list-checks" class="mr-2 h-5 w-5 text-indigo-600"></i> Détails de l'Évaluation
                 </h4>
                 <div id="findingsList" class="space-y-2">
                     <!-- Les constats (findings) du backend seront insérés ici -->
                 </div>
            </div>

        </div> 
        
        <div id="loadingIndicator" class="hidden text-center mt-6">
             <div class="inline-flex items-center px-4 py-2 font-semibold leading-6 text-sm shadow rounded-md text-indigo-700 bg-white transition ease-in-out duration-150 cursor-not-allowed">
                 <svg class="animate-spin -ml-1 mr-3 h-5 w-5 text-indigo-500" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
                     <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
                     <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
                 </svg>
                 Analyse en cours...
            </div>
        </div>

        <div id="errorSection" class="hidden mt-6 p-4 bg-red-100 border border-red-400 text-red-700 rounded-lg">
             <h4 class="font-bold flex items-center"><i data-lucide="alert-triangle" class="mr-2 h-5 w-5"></i> Erreur</h4>
             <p id="errorMessage" class="text-sm"></p>
        </div>

    </div>

    <!-- ===== CORRECTION : Scripts déplacés à la fin du body ===== -->
    <script src="https://cdn.jsdelivr.net/npm/lucide-static@latest/dist/lucide.min.js"></script>
    <script>
        // On attend que le DOM soit entièrement chargé avant d'exécuter le script.
        document.addEventListener('DOMContentLoaded', function () {
            // --- Éléments DOM ---
            const inputDataEl = document.getElementById('inputData');
            const verifyButton = document.getElementById('verifyButton');
            const reportSection = document.getElementById('reportSection');
            const errorSection = document.getElementById('errorSection');
            const errorMessageEl = document.getElementById('errorMessage');
            const loadingIndicator = document.getElementById('loadingIndicator');

            // Éléments du rapport
            const gaugeScoreEl = document.getElementById('gaugeScore');
            const gaugeMaskEl = document.getElementById('gaugeMask');
            const reportSummaryEl = document.getElementById('reportSummary');
            const findingsListEl = document.getElementById('findingsList');


            // Initialiser Lucide Icons
            lucide.createIcons();

            // --- Logique ---

            verifyButton.addEventListener('click', handleVerificationRequest);

            async function handleVerificationRequest() {
                const inputText = inputDataEl.value.trim();
                if (!inputText) {
                    showError("Veuillez entrer une URL ou du texte.");
                    return;
                }

                // Afficher le chargement, masquer l'ancien rapport/erreur
                loadingIndicator.classList.remove('hidden');
                reportSection.classList.add('hidden');
                errorSection.classList.add('hidden');
                verifyButton.disabled = true;

                try {
                    // =================================================================
                    // === MODIFICATION CLÉ : APPEL RÉEL AU SERVEUR (BACKEND) ===
                    // =================================================================
                    
                    // METTEZ VOTRE URL NGROK ICI !
                    const apiUrl = 'https://37e9-34-136-178-210.ngrok-free.app/api/verify'; // <--- REMPLACEZ PAR VOTRE URL NGROK

                    const response = await fetch(apiUrl, {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ input_data: inputText })
                    });

                    if (!response.ok) {
                        const errorData = await response.json();
                        throw new Error(errorData.error || `Erreur serveur: ${response.status}`);
                    }
                    const reportData = await response.json();
                    
                    // Afficher le rapport
                    displayReport(reportData);
                    reportSection.classList.remove('hidden');

                } catch (error) {
                    console.error("Erreur lors de la vérification:", error);
                    showError(error.message || "Une erreur inconnue est survenue. Vérifiez l'URL du serveur et que celui-ci est bien en cours d'exécution.");
                } finally {
                    // Masquer le chargement, réactiver le bouton
                    loadingIndicator.classList.add('hidden');
                    verifyButton.disabled = false;
                }
            }

            function displayReport(report) {
                // 1. Score Global et Résumé
                const score = report.final_score; // Le score est maintenant sur 10
                gaugeScoreEl.textContent = score.toFixed(1);
                // Calculer l'angle pour le masque de la jauge (0 score = 0 deg, 10 score = 180 deg)
                const rotation = Math.max(0, Math.min(180, (10 - score) / 10 * 180));
                gaugeMaskEl.style.transform = `rotate(${rotation}deg)`;
                reportSummaryEl.textContent = report.summary || "Résumé non disponible.";
                
                // 2. Afficher les constats (findings)
                findingsListEl.innerHTML = ''; // Vider la liste précédente
                if (report.findings && report.findings.length > 0) {
                    report.findings.forEach(finding => {
                        const impactColor = finding.impact >= 0 ? 'text-green-600' : 'text-red-600';
                        const bgColor = finding.impact >= 0 ? 'bg-green-50' : 'bg-red-50';
                        const icon = finding.impact >= 0 
                            ? `<i data-lucide="arrow-up-circle" class="h-5 w-5 text-green-500"></i>`
                            : `<i data-lucide="arrow-down-circle" class="h-5 w-5 text-red-500"></i>`;

                        const findingElement = document.createElement('div');
                        findingElement.className = `p-2 ${bgColor} rounded-md flex items-start space-x-3`;
                        findingElement.innerHTML = `
                            <div class="flex-shrink-0 pt-0.5">${icon}</div>
                            <div class="flex-1">
                                <p class="text-sm font-semibold text-gray-800">${finding.source}</p>
                                <p class="text-sm text-gray-600">${finding.description}</p>
                                ${finding.evidence ? `<p class="text-xs text-gray-500 mt-1"><em>Évidence: ${finding.evidence}</em></p>` : ''}
                            </div>
                            <div class="font-semibold text-sm ${impactColor}">
                                ${finding.impact > 0 ? '+' : ''}${(finding.impact * 5).toFixed(1)}
                            </div>
                        `;
                        findingsListEl.appendChild(findingElement);
                    });
                } else {
                    findingsListEl.innerHTML = `<p class="text-sm text-gray-500 text-center">Aucun détail spécifique n'a été généré pour cette analyse.</p>`;
                }
                lucide.createIcons(); // Recréer les icônes ajoutées dynamiquement
            }

            // --- Gestion des erreurs ---
            function showError(message) {
                errorMessageEl.textContent = message;
                errorSection.classList.remove('hidden');
                reportSection.classList.add('hidden'); // Masquer le rapport si erreur
            }
        });
    </script>
</body>
</html>

To cite this code:

Loyer, Dominique. (2024). backend_7juin25.html [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

Cart_ClassificationTree_V1.R

This R code builds, prunes, and visualizes a classification decision tree to predict customer payment default from financial data.

Keywords: decision tree, classification, payment default, R, rpart

###
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Decision_Tree_Accelerator")
library(rpart)
##Read the data in the file
cust_data<-read.csv("Default_On_Payment_CHAID.csv")

###rpart(formula, data=, method=,control=) 
# fit<-rpart(Default_On_Payment~Status_Checking_Acc+Duration_in_Months+Credit_History+Purposre_Credit_Taken+
#              Credit_Amount+Savings_Acc+Years_At_Present_Employment+Inst_Rt_Income+Marital_Status_Gender+
#              Other_Debtors_Guarantors+Current_Address_Yrs+Property+Age+Other_Inst_Plans+Housing+
#              Num_CC+Job+Dependents+Telephone+Foreign_Worker,
#             data=cust_data, method="class", 
#            control=rpart.control(minsplit=20, cp=0.01))


fit<-rpart(Default_On_Payment~Status_Checking_Acc+Credit_History, data=cust_data, method="class", 
           control=rpart.control(minsplit=50, cp=0.001))


##display complexity parameter table
printcp(fit)

###plot cross-validation results
plotcp(fit)

###detailed results including splits
summary(fit)
printcp(fit)

###Prune the tree to the desired size ..at min error cp
pfit<- prune(fit, cp = 0.001)

###plot decision tree 
plot(pfit, uniform=TRUE, main="Classification Tree for Default_on_payment")

###label the decision tree plot 
text(pfit,splits = TRUE, use.n=TRUE, all=TRUE, cex=0.5, pretty=1)
labels(pfit)
library(rpart.plot)

prp(pfit, type=4, extra=4, under=TRUE)


To cite this code:

Loyer, Dominique. (2024). Cart_ClassificationTree_V1.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

check_env.py

This Python script prints diagnostic information about the execution environment, notably the path of the Python executable and the module search paths.

Keywords: Python, environment, diagnostics, paths, modules

# check_env.py
import sys
import os

print("--- DIAGNOSTIC DE L'ENVIRONNEMENT PYTHON ---")
print(f"Chemin de l'exécutable Python (sys.executable) : {sys.executable}")
print("\nChemins où Python cherche les paquets (sys.path):")
for path in sys.path:
    print(f"- {path}")
print("\n--- FIN DU DIAGNOSTIC ---")

To cite this code:

Loyer, Dominique. (2024). check_env.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

chercheur110725.py

An interactive Tkinter application that tests the sequential addition of user-interface components, such as a search bar and a results table, in order to isolate rendering problems.

Keywords: Tkinter, GUI, diagnostics, testing, components

# -*- coding: utf-8 -*-

import tkinter as tk
from tkinter import ttk, messagebox

class InteractiveTester:
    def __init__(self, root):
        self.root = root
        self.root.title("Outil de Diagnostic Interactif (v14)")
        self.root.geometry("900x700")
        self.root.configure(bg='#f0f0f0')

        self.status_label = tk.Label(
            self.root,
            text="Prêt. Cliquez sur les boutons pour construire l'interface.",
            bg='#d3d3d3', # Gris pour le différencier
            fg='black'
        )
        self.status_label.pack(fill=tk.X, side=tk.BOTTOM)

        # --- Cadre pour les boutons de test ---
        button_frame = tk.Frame(self.root, bg='#c0c0c0', padx=10, pady=10)
        button_frame.pack(fill=tk.X, side=tk.TOP)

        self.b1 = tk.Button(button_frame, text="1. Ajouter la barre de recherche", command=self.add_search_bar)
        self.b1.pack(side=tk.LEFT, padx=5)

        self.b2 = tk.Button(button_frame, text="2. Ajouter le tableau de résultats", command=self.add_results_table, state=tk.DISABLED)
        self.b2.pack(side=tk.LEFT, padx=5)
        
        # Cadre où les widgets seront ajoutés
        self.content_frame = tk.Frame(self.root, bg='#f0f0f0')
        self.content_frame.pack(expand=True, fill=tk.BOTH)


    def add_search_bar(self):
        """Teste l'ajout de la barre de recherche."""
        try:
            self.status_label.config(text="Ajout de la barre de recherche...")
            
            # Utilisation de .place() pour un positionnement absolu et simple
            tk.Label(self.content_frame, text="Mots-clés:", bg='#f0f0f0', fg='black').place(x=10, y=10)
            
            tk.Entry(self.content_frame, bg='white', fg='black', insertbackground='black').place(x=90, y=10, width=300)
            
            tk.Button(self.content_frame, text="Rechercher", highlightbackground='#f0f0f0').place(x=400, y=10)
            
            self.status_label.config(text="Barre de recherche ajoutée avec succès ! Passez à l'étape 2.")
            self.b1.config(state=tk.DISABLED, text="1. Barre de recherche OK")
            self.b2.config(state=tk.NORMAL)
            
        except Exception as e:
            messagebox.showerror("Erreur", f"Échec lors de l'ajout de la barre de recherche: {e}")
            self.status_label.config(text=f"Erreur à l'étape 1: {e}")

    def add_results_table(self):
        """Teste l'ajout du tableau de résultats (Treeview)."""
        try:
            self.status_label.config(text="Ajout du tableau de résultats...")
            
            # Cadre pour le tableau
            tree_frame = tk.Frame(self.content_frame)
            tree_frame.place(x=10, y=50, relwidth=0.97, relheight=0.9)

            style = ttk.Style()
            style.theme_use('aqua')
            
            tree = ttk.Treeview(tree_frame, columns=('col1', 'col2'), show='headings')
            tree.heading('col1', text='Colonne 1')
            tree.heading('col2', text='Colonne 2')
            
            vsb = ttk.Scrollbar(tree_frame, orient="vertical", command=tree.yview)
            tree.configure(yscrollcommand=vsb.set)

            tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
            vsb.pack(side=tk.RIGHT, fill=tk.Y)
            
            tree.insert('', 'end', values=('Test 1', 'Donnée A'))
            tree.insert('', 'end', values=('Test 2', 'Donnée B'))

            self.status_label.config(text="TOUT A FONCTIONNÉ ! Le problème est ailleurs.")
            self.b2.config(state=tk.DISABLED, text="2. Tableau OK")

        except Exception as e:
            messagebox.showerror("Erreur", f"ÉCHEC LORS DE L'AJOUT DU TABLEAU ! C'est le composant fautif.\n\nErreur: {e}")
            self.status_label.config(text=f"Erreur à l'étape 2: {e}")


if __name__ == "__main__":
    main_root = tk.Tk()
    app = InteractiveTester(main_root)
    main_root.mainloop()

To cite this code:

Loyer, Dominique. (2024). chercheur110725.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

classifierv1806.py

This Python script automatically organizes the files in a folder, detects duplicates, and uses an AI (Gemini) together with a persistent memory to classify files, with user validation of the proposed categories.

Keywords: organization, files, duplicates, AI classification, memory
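As a quick illustration of the persistent memory the script maintains: `organizer_memory.json` holds the three keys read back in `load_memory()`. Only the key names below come from the script; the values are made-up examples.

# Hypothetical contents of organizer_memory.json (key names from load_memory()).
example_memory = {
    "learned_categories": ["Invoices", "Research", "Photos"],
    "classification_map_by_name": {"report_2023.pdf": "Research"},
    "classification_map_by_hash": {"<sha256-of-file>": "Photos"},  # keyed by file content hash
}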

import os
import sys
import hashlib
import shutil
import json
import requests
from collections import defaultdict
import time

# --- Configuration de la Mémoire ---
MEMORY_FILE = os.path.join(os.path.expanduser('~'), 'Desktop', 'organizer_memory.json')

# =============================================================================
# --- FONCTIONS DE GESTION DE LA MÉMOIRE ---
# =============================================================================

def load_memory():
    """Charge la mémoire (catégories, map de noms, et map de hash)."""
    if os.path.exists(MEMORY_FILE):
        print("\nINFO: Fichier de mémoire trouvé. Chargement des connaissances antérieures...")
        with open(MEMORY_FILE, 'r', encoding='utf-8') as f:
            try:
                memory = json.load(f)
                # Assure que toutes les clés nécessaires existent
                memory.setdefault("learned_categories", [])
                memory.setdefault("classification_map_by_name", {})
                memory.setdefault("classification_map_by_hash", {})
                return memory
            except json.JSONDecodeError:
                pass # Fichier corrompu, on en crée un nouveau
    print("\nINFO: Aucun fichier de mémoire trouvé. Démarrage avec une table rase.")
    return {"learned_categories": [], "classification_map_by_name": {}, "classification_map_by_hash": {}}

def save_memory(data):
    """Sauvegarde la mémoire dans le fichier JSON."""
    print("\nMise à jour de la mémoire avec les nouvelles connaissances...")
    try:
        with open(MEMORY_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print(f"Mémoire sauvegardée avec succès dans : {MEMORY_FILE}")
    except Exception as e:
        print(f"[ERREUR] Impossible de sauvegarder la mémoire : {e}")

# =============================================================================
# --- ÉTAPES 1 & 2 (DOUBLONS) ---
# Le code est identique aux versions précédentes.
# =============================================================================

def calculate_hash(filepath, block_size=65536):
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except (IOError, FileNotFoundError):
        return None

def find_duplicates(folder):
    print(f"\n--- ÉTAPE 1: Recherche des doublons dans : {folder} ---")
    hashes = defaultdict(list)
    excluded_dirs = ["Doublons", "Classement_Final"]
    files_to_scan = [os.path.join(dp, fn) for dp, dn, fns in os.walk(folder) for fn in fns if not any(excluded in dp for excluded in excluded_dirs)]
    total_files = len(files_to_scan)
    if total_files == 0: return {}, []
    for i, filepath in enumerate(files_to_scan):
        try:
            sys.stdout.write(f"\rAnalyse des fichiers : {((i + 1) / total_files) * 100:.1f}%")
            sys.stdout.flush()
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash: hashes[file_hash].append(filepath)
        except (FileNotFoundError, OSError): continue
    print("\nAnalyse des doublons terminée.")
    duplicates = {h: p for h, p in hashes.items() if len(p) > 1}
    unique_files = [p[0] for h, p in hashes.items() if len(p) == 1]
    return duplicates, unique_files

def get_readability_score(filename):
    score = len(filename)
    if any(c.isalpha() for c in filename): score += 20
    if filename.split('.')[0].isdigit(): score -= 10
    return score

def process_duplicates_safely(duplicates, base_folder, desktop_path):
    if not duplicates:
        print("\n--- ÉTAPE 2: Traitement des doublons ---\nBonne nouvelle ! Aucun fichier en double à traiter.")
        return []
    print(f"\n--- ÉTAPE 2: Traitement automatique et protégé de {len(duplicates)} groupes de doublons ---")
    duplicates_folder = os.path.join(base_folder, "Doublons")
    os.makedirs(duplicates_folder, exist_ok=True)
    print(f"Les doublons non-protégés seront déplacés dans : {duplicates_folder}")
    files_kept = []
    for group_paths in duplicates.values():
        try:
            if desktop_path and any(p.startswith(desktop_path) for p in group_paths):
                file_to_keep = max((p for p in group_paths if p.startswith(desktop_path)), key=os.path.getmtime)
                final_path = file_to_keep
            else:
                file_to_keep = max(group_paths, key=os.path.getmtime)
                best_name = os.path.basename(max(group_paths, key=lambda p: get_readability_score(os.path.basename(p))))
                original_dir = os.path.dirname(file_to_keep)
                potential_new_path = os.path.join(original_dir, best_name)
                final_path = file_to_keep
                if file_to_keep != potential_new_path and not os.path.exists(potential_new_path):
                    os.rename(file_to_keep, potential_new_path)
                    final_path = potential_new_path
            files_kept.append(final_path)
            for path in group_paths:
                if path != file_to_keep and not (desktop_path and path.startswith(desktop_path)):
                    try: shutil.move(path, os.path.join(duplicates_folder, os.path.basename(path)))
                    except Exception as e: print(f"Erreur déplacement doublon {os.path.basename(path)}: {e}")
        except Exception as e: print(f"\nErreur traitement groupe doublons : {e}")
    print("\nTraitement des doublons terminé.")
    return files_kept

# =============================================================================
# --- ÉTAPE 3: CLASSIFICATION IA (AVEC SUPER-MÉMOIRE) ---
# =============================================================================

def learn_structure_from_path(path_to_learn):
    learned_categories = set()
    if not os.path.isdir(path_to_learn): return learned_categories
    print(f"Analyse de la structure de : {path_to_learn}...")
    for root, dirs, _ in os.walk(path_to_learn):
        for dir_name in dirs:
            if dir_name.startswith('.') or dir_name.lower() in ["attachments", "files", "images"]: continue
            clean_name = dir_name.replace('_', ' ').strip()
            if len(clean_name) > 3: learned_categories.add(clean_name)
    return learned_categories

def propose_categories_with_ai(files_for_ai, learned_categories, api_key):
    if not files_for_ai: return {}
    print(f"\n--- ÉTAPE 3b: Interrogation de l'IA pour {len(files_for_ai)} nouveaux fichiers ---")
    BATCH_SIZE, MAX_RETRIES, BACKOFF_FACTOR = 75, 3, 2
    ai_classifications = {}
    total_files = len(files_for_ai)
    for i in range(0, total_files, BATCH_SIZE):
        batch_files = files_for_ai[i:i + BATCH_SIZE]
        batch_filenames = [os.path.basename(f) for f in batch_files]
        current_batch_num = (i // BATCH_SIZE) + 1
        total_batches = (total_files + BATCH_SIZE - 1) // BATCH_SIZE
        print(f"\nTraitement du lot {current_batch_num}/{total_batches}...")
        prompt = (f"En te basant sur cette liste de catégories apprises : {list(learned_categories)}. Pour la liste de noms de fichiers suivante : {json.dumps(batch_filenames)}. Propose la catégorie la plus pertinente pour CHAQUE fichier. Si aucune catégorie ne correspond, invente une catégorie pertinente et concise (2-4 mots). Réponds UNIQUEMENT avec un objet JSON où chaque clé est un nom de fichier et sa valeur est la catégorie proposée.")
        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(apiUrl, json=payload, timeout=90)
                response.raise_for_status()
                json_text = response.json()['candidates'][0]['content']['parts'][0]['text']
                json_text = json_text.strip().replace("```json", "").replace("```", "")
                batch_classifications = json.loads(json_text)
                ai_classifications.update(batch_classifications)
                print(f"Lot {current_batch_num} traité avec succès.")
                break
            except requests.exceptions.HTTPError as err:
                if err.response.status_code in [500, 503, 504] and (attempt + 1) < MAX_RETRIES:
                    wait_time = BACKOFF_FACTOR * (2 ** attempt)
                    print(f"  -> Échec temporaire (tentative {attempt + 1}/{MAX_RETRIES}): {err.response.status_code}. Nouvelle tentative dans {wait_time}s...")
                    time.sleep(wait_time)
                else: print(f"\n[ERREUR HTTP] Échec final du lot {current_batch_num}. Ce lot sera ignoré."); break
            except Exception as e: print(f"\n[ERREUR] Échec du lot {current_batch_num} : {e}. Ce lot sera ignoré."); break
        time.sleep(1)
    return ai_classifications

# =============================================================================
# --- ÉTAPES 4 & 5 (VALIDATION, ORGANISATION) ---
# =============================================================================

def validate_categories(classification_plan):
    if not classification_plan: return set()
    proposed_categories = sorted(list(set(classification_plan.values())))
    print("\n--- ÉTAPE 4: Validation des catégories proposées ---")
    for i, cat_name in enumerate(proposed_categories): print(f"  {i+1}) {cat_name}")
    validated_categories = set()
    while True:
        try:
            choices_str = input("\nEntrez les numéros des catégories à conserver (ex: 1,3,4), ou 'toutes' : ")
            if choices_str.lower() == 'toutes': return set(proposed_categories)
            chosen_indices = [int(i.strip()) - 1 for i in choices_str.split(',')]
            for index in chosen_indices:
                if 0 <= index < len(proposed_categories): validated_categories.add(proposed_categories[index])
            if validated_categories: print("\nCatégories validées :", ", ".join(validated_categories)); return validated_categories
        except ValueError: print("[ERREUR] Entrée invalide.")

def execute_final_organization(base_folder, classification_plan, validated_categories, desktop_path):
    print("\n--- ÉTAPE 5: Organisation finale des fichiers ---")
    output_root = os.path.join(base_folder, "Classement_Final")
    os.makedirs(output_root, exist_ok=True)
    print(f"Les fichiers seront déplacés et organisés dans : {output_root}")
    for cat in validated_categories: os.makedirs(os.path.join(output_root, cat), exist_ok=True)
    unclassified_dir = os.path.join(output_root, "Non Classé")
    os.makedirs(unclassified_dir, exist_ok=True)
    for i, (original_path, category) in enumerate(classification_plan.items()):
        sys.stdout.write(f"\rDéplacement des fichiers : {((i + 1) / len(classification_plan)) * 100:.1f}%")
        sys.stdout.flush()
        if os.path.exists(original_path) and not (desktop_path and original_path.startswith(desktop_path)):
            target_dir = os.path.join(output_root, category) if category in validated_categories else unclassified_dir
            try: shutil.move(original_path, os.path.join(target_dir, os.path.basename(original_path)))
            except Exception as e: print(f"\n[ERREUR] Impossible de déplacer {original_path}. Erreur: {e}")
    print(f"\n\nL'organisation finale est terminée !\nNote : les fichiers sur votre Bureau n'ont pas été déplacés.")

# =============================================================================
# --- FONCTION PRINCIPALE (main) ---
# =============================================================================

def main():
    try:
        print("--- Assistant d'Organisation v15 (Super-Mémoire) ---")
        memory = load_memory()
        
        home = os.path.expanduser('~')
        desktop_path = next((p for p in [os.path.join(home, 'Desktop'), os.path.join(home, 'Bureau')] if os.path.isdir(p)), "")
        if desktop_path: print(f"Règle de protection activée pour le Bureau : {desktop_path}")
        
        api_key = input("Veuillez coller votre clé API Google AI Studio : ").strip()
        
        obsidian_path = input("1. (Optionnel) Glissez-déposez votre coffre Obsidian pour mettre à jour la mémoire : ").strip("'\"")
        notion_path = input("2. (Optionnel) Glissez-déposez votre export Notion pour mettre à jour la mémoire : ").strip("'\"")
        
        learned_from_folders = learn_structure_from_path(obsidian_path)
        learned_from_folders.update(learn_structure_from_path(notion_path))
        all_learned_categories = set(memory["learned_categories"]) | learned_from_folders
        if all_learned_categories: print(f"\nConnaissances actuelles : {len(all_learned_categories)} catégories mémorisées.")

        folder_path = input("\n3. Maintenant, glissez-déposez le dossier principal à analyser : ").strip("'\"")
        if not os.path.isdir(folder_path): return

        duplicates, unique_files = find_duplicates(folder_path)
        files_kept_after_dedup = process_duplicates_safely(duplicates, folder_path, desktop_path)
        all_files_to_classify = list(set(unique_files + files_kept_after_dedup))
        all_files_to_classify = [f for f in all_files_to_classify if os.path.exists(f)]

        if not all_files_to_classify:
            print("\nIl ne reste aucun fichier à classer. Opération terminée.")
            return

        # --- NOUVELLE LOGIQUE DE CLASSIFICATION AVEC SUPER-MÉMOIRE ---
        print("\n--- ÉTAPE 3a: Consultation de la mémoire pour la classification ---")
        classification_plan = {}
        files_for_ai = []
        map_by_name = memory["classification_map_by_name"]
        map_by_hash = memory["classification_map_by_hash"]
        
        for i, f_path in enumerate(all_files_to_classify):
            sys.stdout.write(f"\rConsultation mémoire : {((i + 1) / len(all_files_to_classify)) * 100:.1f}%")
            sys.stdout.flush()
            f_name = os.path.basename(f_path)
            
            # Priorité 1: Mémoire par nom de fichier
            if f_name in map_by_name:
                classification_plan[f_path] = map_by_name[f_name]
                continue

            # Priorité 2: Mémoire par contenu (hash)
            file_hash = calculate_hash(f_path)
            if file_hash in map_by_hash:
                classification_plan[f_path] = map_by_hash[file_hash]
                continue
            
            # Si inconnu, on l'ajoute à la liste pour l'IA
            files_for_ai.append(f_path)

        print(f"\n{len(classification_plan)} fichiers classés instantanément grâce à la mémoire.")

        if api_key and files_for_ai:
            ai_results_by_name = propose_categories_with_ai(files_for_ai, all_learned_categories, api_key)
            for f_path in files_for_ai:
                f_name = os.path.basename(f_path)
                classification_plan[f_path] = ai_results_by_name.get(f_name, "Non Classé")
        else:
             for f_path in files_for_ai: classification_plan[f_path] = "Non Classé"

        # --- Validation et Exécution ---
        validated_categories = validate_categories(classification_plan)
        if not validated_categories: print("\nAucune catégorie validée. Opération annulée."); return

        confirm = input("\nPrêt à lancer l'organisation finale ? (o/n) > ").lower()
        if confirm == 'o':
            execute_final_organization(folder_path, classification_plan, validated_categories, desktop_path)
            
            # --- Mise à jour finale de la mémoire ---
            memory["learned_categories"] = sorted(list(validated_categories | all_learned_categories))
            for f_path, category in classification_plan.items():
                if category in validated_categories:
                    f_name = os.path.basename(f_path)
                    f_hash = calculate_hash(f_path)
                    memory["classification_map_by_name"][f_name] = category
                    if f_hash: memory["classification_map_by_hash"][f_hash] = category
            save_memory(memory)
        else:
            print("\nOpération finale annulée. La mémoire n'a pas été mise à jour.")

    except (EOFError, KeyboardInterrupt): print("\n\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). classifierv1806.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

Copie de Backend Flask_app.py

The code initializes a Flask API for credibility verification through an external system, handling client requests and import errors for the verification module.

Keywords: Flask, REST API, verification, credibility, system
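For reference, once the server below is running it listens on port 5000; a minimal client call (a sketch, not part of this file) might look like the following. The URL is a local placeholder, and `scoreCredibilite` is simply the field the server itself logs.

import requests

# Hypothetical client call to the /api/verify endpoint exposed by the Flask app below.
resp = requests.post(
    "http://127.0.0.1:5000/api/verify",
    json={"input_data": "https://www.example.com"},
    timeout=90,
)
resp.raise_for_status()
report = resp.json()
print(report.get("scoreCredibilite", "N/A"))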

{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true,"base_uri":"https://localhost:8080/"},"id":"Q4bEq6K9LsdE"},"outputs":[{"name":"stdout","output_type":"stream","text":["Error: Could not import CredibilityVerificationSystem.\n","Please ensure 'credibility_system.py' exists and is in the correct path.\n","Current sys.path: ['/content', '/env/python', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.11/dist-packages/IPython/extensions', '/usr/local/lib/python3.11/dist-packages/setuptools/_vendor', '/root/.ipython']\n","CredibilityVerificationSystem module not found. Flask app will start, but /api/verify will not work.\n","Flask app created and CORS enabled.\n","Starting Flask development server...\n"," * Serving Flask app '__main__'\n"," * Debug mode: on\n"]},{"name":"stderr","output_type":"stream","text":["INFO:werkzeug:\u001b[31m\u001b[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.\u001b[0m\n"," * Running on all addresses (0.0.0.0)\n"," * Running on http://127.0.0.1:5000\n"," * Running on http://172.28.0.12:5000\n","INFO:werkzeug:\u001b[33mPress CTRL+C to quit\u001b[0m\n","INFO:werkzeug: * Restarting with stat\n"]}],"source":["# Import necessary libraries\n","from flask import Flask, request, jsonify\n","# Install flask_cors if not already installed\n","try:\n","    from flask_cors import CORS # To handle Cross-Origin Resource Sharing\n","except ImportError:\n","    print(\"flask_cors not found. Installing...\")\n","    !pip install flask_cors\n","    from flask_cors import CORS\n","    print(\"flask_cors installed successfully.\")\n","\n","import sys\n","import os\n","import traceback # To help with debugging errors\n","\n","# --- Configuration Import ---\n","# Add the directory containing 'credibility_system.py' to the Python path\n","# Adjust this path if your file structure is different\n","# Assumes 'credibility_system.py' is in the same directory as 'app.py'\n","# If it's elsewhere, provide the correct path.\n","# Example: sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'path_to_your_module')))\n","\n","# Initialize credibility_checker to None, so we can check if it was successfully imported\n","credibility_checker = None\n","\n","try:\n","    # Assuming your CredibilityVerificationSystem class is in a file named 'credibility_system.py'\n","    # in the same directory as this 'app.py' file.\n","    from credibility_system import CredibilityVerificationSystem\n","    print(\"Successfully imported CredibilityVerificationSystem.\")\n","    # --- Instantiate the System ---\n","    # Load models only once when the server starts\n","    # This can take time, so the server might be slow to start initially.\n","    print(\"Initializing CredibilityVerificationSystem... 
(This might take a moment)\")\n","    credibility_checker = CredibilityVerificationSystem()\n","    print(\"CredibilityVerificationSystem initialized.\")\n","\n","except ImportError:\n","    print(\"Error: Could not import CredibilityVerificationSystem.\")\n","    print(\"Please ensure 'credibility_system.py' exists and is in the correct path.\")\n","    print(f\"Current sys.path: {sys.path}\")\n","    # Removed sys.exit() to avoid stopping the notebook execution\n","    # The credibility_checker will remain None, and the API endpoint\n","    # will need to handle this case.\n","    print(\"CredibilityVerificationSystem module not found. Flask app will start, but /api/verify will not work.\")\n","except Exception as e:\n","    print(f\"Error initializing CredibilityVerificationSystem: {e}\")\n","    traceback.print_exc() # Print detailed error traceback\n","    # Removed sys.exit()\n","    print(\"Failed to initialize credibility system. Flask app will start, but /api/verify will not work.\")\n","\n","\n","# --- Flask App Initialization ---\n","app = Flask(__name__)\n","# IMPORTANT: Enable CORS to allow requests from your HTML file (frontend)\n","# For development, allow all origins ('*'). For production, restrict this\n","# to the actual domain where your frontend is hosted.\n","CORS(app)\n","print(\"Flask app created and CORS enabled.\")\n","\n","# --- API Route Definition ---\n","@app.route('/api/verify', methods=['POST'])\n","def verify_endpoint():\n","    \"\"\"\n","    API endpoint to receive input data and return credibility analysis.\n","    Expects a JSON payload with the key 'input_data'.\n","    \"\"\"\n","    print(\"\\nReceived request on /api/verify\")\n","\n","    # Check if the credibility system was initialized successfully\n","    if credibility_checker is None:\n","        print(\"Error: Credibility verification system not initialized.\")\n","        return jsonify({\"error\": \"Credibility verification system is not available.\"}), 503 # Service Unavailable\n","\n","\n","    # 1. Get data from the request\n","    if not request.is_json:\n","        print(\"Error: Request is not JSON\")\n","        return jsonify({\"error\": \"Request must be JSON\"}), 400 # Bad Request\n","\n","    data = request.get_json()\n","    input_data = data.get('input_data', None)\n","\n","    if not input_data or not isinstance(input_data, str) or not input_data.strip():\n","        print(\"Error: 'input_data' is missing or invalid\")\n","        return jsonify({\"error\": \"'input_data' field is required and must be a non-empty string\"}), 400\n","\n","    print(f\"Processing input: {input_data[:100]}...\") # Log received data (truncated)\n","\n","    # 2. Call the credibility verification system\n","    try:\n","        # Use the pre-initialized checker instance\n","        results = credibility_checker.verify_information(input_data)\n","        print(\"Verification successful.\")\n","        # Check if the verification itself returned an error structure\n","        if isinstance(results, dict) and 'error' in results:\n","             print(f\"Verification returned an error: {results['error']}\")\n","             # Return the specific error from the verification logic\n","             # Use 400 (Bad Request) or potentially 500 (Internal Server Error)\n","             # depending on the nature of the error. 400 is often suitable if\n","             # the input caused the issue (e.g., invalid URL, empty text after processing).\n","             return jsonify(results), 400\n","\n","        # 3. 
Return the results as JSON\n","        print(f\"Returning report with score: {results.get('scoreCredibilite', 'N/A')}\")\n","        return jsonify(results), 200 # OK\n","\n","    except Exception as e:\n","        # Catch any unexpected errors during verification\n","        print(f\"Error during verification process: {e}\")\n","        traceback.print_exc() # Log the full error traceback for debugging\n","        # Return a generic server error message\n","        return jsonify({\"error\": \"An internal server error occurred during analysis.\"}), 500 # Internal Server Error\n","\n","\n","# --- Run the App (for development) ---\n","if __name__ == '__main__':\n","    # Runs the Flask development server.\n","    # host='0.0.0.0' makes it accessible on your network (useful for testing from other devices)\n","    # debug=True provides automatic reloading and more detailed error pages (DO NOT use in production)\n","    print(\"Starting Flask development server...\")\n","    # Flask's development server might restart multiple times when debug=True,\n","    # leading to re-execution of the code cell. This is normal behavior.\n","    app.run(host='0.0.0.0', port=5000, debug=True)"]}],"metadata":{"colab":{"name":"","provenance":[{"file_id":"1H6oPi2o3xKgNC1ZYv5rDnekxNJ-bYkHC","timestamp":1745620655445}],"version":""},"kernelspec":{"display_name":"Python 3","name":"python3"}},"nbformat":4,"nbformat_minor":0}

To cite this code:

Loyer, Dominique. (2024). Copie de Backend Flask_app.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

dc706.py

This Python script organizes files by identifying duplicates and suggesting their deletion, renaming them intelligently, then classifying and copying them into thematic folders based on UQAM course categories, optionally via an AI API.

Keywords: files, duplicates, classification, organization, UQAM

import os
import sys
import hashlib
import time
import json
import requests
import shutil  # Module pour copier les fichiers
from collections import defaultdict

# --- Fonctions de base (trouver les doublons) ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def find_duplicates(folder):
    """Trouve les fichiers en double."""
    print(f"\nÉtape 1: Recherche des doublons dans : {folder}")
    hashes = defaultdict(list)
    total_files = sum(len(files) for _, _, files in os.walk(folder))
    scanned_files = 0

    if total_files == 0:
        print("\nDossier vide ou inaccessible.")
        return {}

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            progress = (scanned_files / total_files) * 100
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nRecherche des doublons terminée.\n")
    return {h: p for h, p in hashes.items() if len(p) > 1}

# --- Fonctions de renommage et suggestion de suppression ---

def get_readability_score(filename):
    """Attribue un score à un nom de fichier."""
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def get_best_filename(filepaths):
    """Choisit le meilleur nom de fichier."""
    return os.path.basename(max(filepaths, key=lambda p: (get_readability_score(os.path.basename(p)), len(p))))

def process_duplicates_and_suggest_deletions(duplicates):
    """Effectue le renommage intelligent et liste les doublons à supprimer."""
    if not duplicates:
        print("\nBonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return []

    print(f"Étape 2: Traitement de {len(duplicates)} groupes de doublons (Renommage intelligent)")
    
    files_to_delete = []
    files_kept = []

    for group_paths in duplicates.values():
        group_paths.sort()
        original_to_keep = group_paths[0]
        original_dir = os.path.dirname(original_to_keep)
        
        best_name = get_best_filename(group_paths)
        final_path = os.path.join(original_dir, best_name)

        if original_to_keep != final_path:
            try:
                if os.path.exists(final_path):
                    final_path = original_to_keep
                else:
                    os.rename(original_to_keep, final_path)
                    print(f"  - Renommé : '{os.path.basename(original_to_keep)}' -> '{best_name}'")
            except OSError as e:
                print(f"  - ERREUR de renommage : {e}. Le nom original est conservé.")
                final_path = original_to_keep
        
        files_kept.append(final_path)

        # Ajoute les autres fichiers à la liste de suppression
        for path_to_delete in group_paths:
            if path_to_delete != original_to_keep:
                files_to_delete.append(path_to_delete)

    print("\nNettoyage des noms terminé.")
    if files_to_delete:
        print("\nLes fichiers suivants sont des doublons et peuvent être supprimés manuellement :")
        for f in files_to_delete:
            print(f"  - {f}")
    
    return files_kept

# --- Fonctions de classement par IA (version améliorée) ---

def classify_file_custom(filepath, categories):
    """Utilise l'IA pour classer un fichier selon des catégories spécifiques UQAM."""
    try:
        filename = os.path.basename(filepath).lower()
        
        # Tentative de classification directe par mots-clés
        for category_name, keywords in categories.items():
            for keyword in keywords:
                if keyword in filename:
                    return category_name

        # Si aucune correspondance, on demande à l'IA
        prompt = (f"Le nom du fichier est '{filename}'. Basé sur les sigles de cours de l'UQAM, "
                  f"classe ce fichier dans l'une de ces catégories : {list(categories.keys())}. "
                  "Exemples: 'projet_dic9345.pdf' va dans 'UQAM - DIC9345 - TALN'. "
                  "'resume_these_v3.docx' va dans 'Thèse de doctorat'. "
                  "Réponds uniquement avec le nom de la catégorie. Si incertain, réponds 'À Classer Manuellement'.")

        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiKey = ""
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
        
        response = requests.post(apiUrl, json=payload, timeout=15)
        response.raise_for_status()
        
        result = response.json()
        category = result.get('candidates')[0]['content']['parts'][0]['text'].strip()
        
        if category in categories:
            return category
        else:
            return "À Classer Manuellement"
            
    except Exception:
        return "À Classer Manuellement"

def classify_and_copy_files(base_folder, all_files):
    """Classe tous les fichiers et en fait une copie dans des dossiers thématiques."""
    print("\nÉtape 3: Classement thématique des fichiers (création de copies)")
    
    # NOUVELLES CATÉGORIES BASÉES SUR VOS IMAGES
    uqam_categories = {
        "UQAM - DIC9251 - Modélisation": ["dic-9251", "dic9251", "modélisation"],
        "UQAM - DIC9335 - Science du web": ["dic-9335", "dic9335", "science du web"],
        "UQAM - DIC9270 - Séminaires": ["dic-9270", "dic9270", "séminaire"],
        "UQAM - DIC9345 - TALN": ["dic-9345", "dic9345", "taln", "langage naturel"],
        "UQAM - DIC9401 - Examen Général": ["dic-9401", "dic9401", "examen général"],
        "UQAM - DIC9411 - Projet Recherche": ["dic-9411", "dic9411", "projet de recherche"],
        "UQAM - DIC9150 - Concepts Fondamentaux": ["dic-9150", "dic9150"],
        "UQAM - DIC9001 - Fondements & Tendances": ["dic-9001", "dic9001"],
        "UQAM - DIC9351 - Apprentissage Machine": ["dic-9351", "dic9351", "apprentissage machine", "machine learning"],
        "Thèse de doctorat": ["thèse", "these", "dic-9500", "dic9500"],
        "À Classer Manuellement": []
    }

    copies_folder = os.path.join(base_folder, "Classement (Copies)")
    os.makedirs(copies_folder, exist_ok=True)
    
    print(f"Les copies classées seront créées dans : {copies_folder}")
    
    total_files_to_classify = len(all_files)
    for i, filepath in enumerate(all_files):
        if not os.path.exists(filepath):
            continue

        progress = ((i + 1) / total_files_to_classify) * 100
        sys.stdout.write(f"\rClassement et copie: {progress:.1f}% - {os.path.basename(filepath)}")
        sys.stdout.flush()

        category = classify_file_custom(filepath, uqam_categories)
        
        target_dir = os.path.join(copies_folder, category)
        os.makedirs(target_dir, exist_ok=True)
        
        try:
            # COPIE DU FICHIER AU LIEU DE DÉPLACEMENT
            shutil.copy2(filepath, target_dir)
        except Exception as e:
            print(f"\nErreur lors de la copie de {filepath} : {e}")
            
    print("\n\nLe classement et la création des copies sont terminés !")

# --- Fonction principale ---

def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        folder_path = folder_path.replace('\\ ', ' ').strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates_found = find_duplicates(folder_path)
        process_duplicates_and_suggest_deletions(duplicates_found)

        print("\nPréparation pour l'étape de classement...")
        # Récupère tous les fichiers restants pour le classement
        all_files_in_folder = set()
        for dirpath, _, filenames in os.walk(folder_path):
            if "Classement (Copies)" in dirpath: # Ignore le dossier de destination
                continue
            for f in filenames:
                all_files_in_folder.add(os.path.join(dirpath, f))
        
        final_list_of_files = [f for f in all_files_in_folder if os.path.exists(f)]

        if final_list_of_files:
            classify_and_copy_files(folder_path, final_list_of_files)
        else:
            print("\nAucun fichier à classer.")

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). dc706.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

dcv3.py

This Python script manages the files in a directory by identifying and deleting duplicates, then organizing the remaining files by thematic category with the help of an AI.

Keywords: deduplication, classification, files, artificial intelligence, organization

import os
import sys
import hashlib
import time
import json
import requests # Utilise la bibliothèque standard pour les requêtes HTTP
from collections import defaultdict

# --- Fonctions de base (trouver les doublons) ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def find_duplicates(folder):
    """Trouve les fichiers en double et retourne un dictionnaire de doublons."""
    print(f"\nÉtape 1: Recherche des doublons dans : {folder}")
    hashes = defaultdict(list)
    total_files = sum(len(files) for _, _, files in os.walk(folder))
    scanned_files = 0

    if total_files == 0:
        print("\nDossier vide ou inaccessible.")
        return {}

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            progress = (scanned_files / total_files) * 100
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nRecherche des doublons terminée.\n")
    return {h: p for h, p in hashes.items() if len(p) > 1}

# --- Fonctions de renommage intelligent ---

def get_readability_score(filename):
    """Attribue un score à un nom de fichier pour déterminer s'il est lisible."""
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def get_best_filename(filepaths):
    """Choisit le meilleur nom de fichier parmi une liste de doublons."""
    # On trie d'abord par score de lisibilité, puis par longueur comme bris d'égalité
    return os.path.basename(max(filepaths, key=lambda p: (get_readability_score(os.path.basename(p)), len(p))))


def process_and_delete_duplicates(duplicates):
    """Gère le renommage intelligent et la suppression des doublons."""
    if not duplicates:
        print("Bonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return []

    print(f"Étape 2: Traitement de {len(duplicates)} groupes de doublons (renommage et suppression)")
    
    try:
        choice = input("Voulez-vous renommer intelligemment et supprimer les doublons ? (o/n) : ").lower()
        if choice != 'o':
            print("Opération annulée.")
            all_files = set()
            for paths in duplicates.values():
                all_files.update(paths)
            return list(all_files)
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return []

    files_kept = []
    for group_paths in duplicates.values():
        group_paths.sort()
        original_path = group_paths[0]
        original_dir = os.path.dirname(original_path)
        
        best_name = get_best_filename(group_paths)
        final_path = os.path.join(original_dir, best_name)

        if original_path != final_path:
            try:
                if os.path.exists(final_path):
                    print(f"  - AVERTISSEMENT: '{best_name}' existe déjà. Le renommage est ignoré.")
                    final_path = original_path
                else:
                    os.rename(original_path, final_path)
                    print(f"  - Renommé : '{os.path.basename(original_path)}' -> '{best_name}'")
            except OSError as e:
                print(f"  - ERREUR de renommage : {e}. Le nom original est conservé.")
                final_path = original_path

        files_kept.append(final_path)

        for path_to_delete in group_paths:
            if path_to_delete != original_path:
                try:
                    os.remove(path_to_delete)
                except OSError as e:
                    print(f"  - ERREUR de suppression de {path_to_delete}: {e}")
    
    print("\nNettoyage des doublons terminé.")
    return files_kept

# --- Fonctions de classement par IA ---

def classify_file_gemini(filepath, categories):
    """Utilise l'API Gemini pour deviner la catégorie d'un fichier."""
    try:
        filename = os.path.basename(filepath)
        prompt = f"Analyse ce nom de fichier '{filename}' et classe-le dans l'une de ces catégories : {categories}. Réponds uniquement avec le nom exact de la catégorie. Si incertain, réponds 'Autre'."
        
        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiKey = "" # Aucune clé nécessaire pour ce modèle dans cet environnement
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
        
        # Appel réseau standard avec la bibliothèque requests
        response = requests.post(apiUrl, json=payload, timeout=10) # Timeout de 10s
        response.raise_for_status() # Lève une erreur si la requête échoue (ex: 404, 500)
        
        result = response.json()
        
        category = result.get('candidates')[0]['content']['parts'][0]['text'].strip()
        # Valide que la catégorie retournée par l'IA est bien dans notre liste
        if category in categories:
            return category
        else:
            return "Autre"
            
    except requests.exceptions.RequestException as e:
        # print(f"\nErreur réseau pour {filename}: {e}")
        return "Autre"
    except Exception as e:
        # print(f"\nErreur de traitement de la réponse API pour {filename}: {e}")
        return "Autre"

def classify_and_move_files(base_folder, all_files):
    """Classe tous les fichiers restants et les déplace."""
    print("\nÉtape 3: Classement thématique des fichiers restants par IA")
    try:
        choice = input("Voulez-vous lancer le classement automatique ? (Cela peut prendre du temps) (o/n) : ").lower()
        if choice != 'o':
            print("Classement ignoré. Le programme est terminé.")
            return
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return

    categories = ["Factures", "Recherche", "Contrats", "Photos", "Personnel", "Travail", "Admin", "Projets", "Autre"]
    classified_folder = os.path.join(base_folder, "Fichiers Classés")
    os.makedirs(classified_folder, exist_ok=True)
    
    print(f"Les fichiers seront classés dans : {classified_folder}")
    print("Catégories utilisées :", ", ".join(categories))
    
    total_files_to_classify = len(all_files)
    for i, filepath in enumerate(all_files):
        if not os.path.exists(filepath):
            continue

        progress = ((i + 1) / total_files_to_classify) * 100
        sys.stdout.write(f"\rClassement: {progress:.1f}% ({i+1}/{total_files_to_classify}) - {os.path.basename(filepath)}")
        sys.stdout.flush()

        category = classify_file_gemini(filepath, categories)
        
        target_dir = os.path.join(classified_folder, category)
        os.makedirs(target_dir, exist_ok=True)
        
        try:
            target_path = os.path.join(target_dir, os.path.basename(filepath))
            if os.path.exists(target_path):
                base, ext = os.path.splitext(os.path.basename(filepath))
                target_path = os.path.join(target_dir, f"{base}_{int(time.time())}{ext}")
            
            os.rename(filepath, target_path)
        except OSError as e:
            print(f"\nErreur lors du déplacement de {filepath} : {e}")
            
    print("\nClassement thématique terminé !")

# --- Fonction principale ---

def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        folder_path = folder_path.replace('\\ ', ' ').strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates_found = find_duplicates(folder_path)
        files_kept = process_and_delete_duplicates(duplicates_found)

        # Récupère tous les fichiers restants pour le classement
        all_files_in_folder = set()
        for dirpath, _, filenames in os.walk(folder_path):
            if "Fichiers Classés" in dirpath:
                continue
            for f in filenames:
                all_files_in_folder.add(os.path.join(dirpath, f))
        
        final_list_of_files = list(all_files_in_folder)

        if final_list_of_files:
            classify_and_move_files(folder_path, final_list_of_files)
        else:
            print("\nAucun fichier à classer.")

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")

if __name__ == "__main__":
    main()

Pour citer ce code :

Loyer, Dominique. (2024). dcv3.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

Debate Analysis 1.ipynb

Jupyter notebook that pulls tweets tagged #presidentialdebate with Tweepy, flags mentions of Trump and Biden, cleans and lemmatizes the text with NLTK and TextBlob, scores sentiment polarity and subjectivity, and plots a rolling-average polarity timeline for each candidate.

Keywords: Twitter API, Tweepy, sentiment analysis, TextBlob, NLTK, pandas, matplotlib
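
Before the full notebook, here is a minimal sketch of the sentiment scoring the analysis relies on; it assumes only that textblob is installed, and the sample sentences are invented:

```
from textblob import TextBlob

samples = [
    "Great performance tonight, very convincing arguments.",
    "That was a confusing and disappointing debate.",
]

for text in samples:
    # TextBlob returns (polarity, subjectivity): polarity in [-1, 1], subjectivity in [0, 1]
    polarity, subjectivity = TextBlob(text).sentiment
    print(f"{polarity:+.2f} / {subjectivity:.2f}  {text}")
```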

!pip install tweepy

# --- Nouvelle Cellule ---

# Import tweepy to work with the twitter API
import tweepy as tw

# Import numpy and pandas to work with dataframes
import numpy as np
import pandas as pd

# Import matplotlib for viz
from matplotlib import pyplot as plt

# --- Nouvelle Cellule ---

consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# --- Nouvelle Cellule ---

# Authenticate
auth = tw.OAuthHandler(consumer_key, consumer_secret)
# Set Tokens
auth.set_access_token(access_token, access_token_secret)
# Instantiate API
api = tw.API(auth, wait_on_rate_limit=True)

# --- Nouvelle Cellule ---

hashtag = "#presidentialdebate"
# Note: api.search is the Tweepy v3.x call; on Tweepy v4+ the equivalent is api.search_tweets
query = tw.Cursor(api.search, q=hashtag).items(1000)
tweets = [{'Tweet':tweet.text, 'Timestamp':tweet.created_at} for tweet in query]
print(tweets)

# --- Nouvelle Cellule ---

df = pd.DataFrame.from_dict(tweets)
df.head()

# --- Nouvelle Cellule ---

trump_handle = ['DonaldTrump', 'Donald Trump', 'Donald', 'Trump', 'Trump\'s']
biden_handle = ['JoeBiden', 'Joe Biden', 'Joe', 'Biden', 'Biden\'s']

# --- Nouvelle Cellule ---

def identify_subject(tweet, refs):
    flag = 0 
    for ref in refs:
        if tweet.find(ref) != -1:
            flag = 1
    return flag

df['Trump'] = df['Tweet'].apply(lambda x: identify_subject(x, trump_handle)) 
df['Biden'] = df['Tweet'].apply(lambda x: identify_subject(x, biden_handle))
df.head(10)

# --- Nouvelle Cellule ---

# Import stopwords
import nltk
from nltk.corpus import stopwords

# Import textblob
from textblob import Word, TextBlob

# --- Nouvelle Cellule ---

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
custom_stopwords = ['RT', '#PresidentialDebate']

# --- Nouvelle Cellule ---

import re

def preprocess_tweets(tweet, custom_stopwords):
    # Strip punctuation, then remove stopwords and lemmatize
    processed_tweet = re.sub(r'[^\w\s]', '', tweet)
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in stop_words)
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in custom_stopwords)
    processed_tweet = " ".join(Word(word).lemmatize() for word in processed_tweet.split())
    return processed_tweet

df['Processed Tweet'] = df['Tweet'].apply(lambda x: preprocess_tweets(x, custom_stopwords))
df.head()

# --- Nouvelle Cellule ---

print('Base tweet\n', df['Tweet'][0])
print('\n------------------------------------\n')
print('Cleaned and lemmatized tweet\n', df['Processed Tweet'][0])

# --- Nouvelle Cellule ---

# Calculate polarity
df['polarity'] = df['Processed Tweet'].apply(lambda x: TextBlob(x).sentiment[0])
df['subjectivity'] = df['Processed Tweet'].apply(lambda x: TextBlob(x).sentiment[1])
df[['Processed Tweet', 'Biden', 'Trump', 'polarity', 'subjectivity']].head()

# --- Nouvelle Cellule ---

display(df[df['Trump']==1][['Trump','polarity','subjectivity']].groupby('Trump').agg([np.mean, np.max, np.min, np.median]))
df[df['Biden']==1][['Biden','polarity','subjectivity']].groupby('Biden').agg([np.mean, np.max, np.min, np.median])

# --- Nouvelle Cellule ---

biden = df[df['Biden']==1][['Timestamp', 'polarity']]
biden = biden.sort_values(by='Timestamp', ascending=True)
biden['MA Polarity'] = biden.polarity.rolling(10, min_periods=3).mean()

trump = df[df['Trump']==1][['Timestamp', 'polarity']]
trump = trump.sort_values(by='Timestamp', ascending=True)
trump['MA Polarity'] = trump.polarity.rolling(10, min_periods=3).mean()

# --- Nouvelle Cellule ---

trump.head()

# --- Nouvelle Cellule ---

repub = 'red'
demo = 'blue'
fig, axes = plt.subplots(2, 1, figsize=(13, 10))

axes[0].plot(biden['Timestamp'], biden['MA Polarity'], color=demo)
axes[0].set_title("Biden Polarity")
axes[1].plot(trump['Timestamp'], trump['MA Polarity'], color=repub)
axes[1].set_title("Trump Polarity")

fig.suptitle("Presidential Debate Analysis", y=0.98)

plt.show()

# --- Nouvelle Cellule ---

Pour citer ce code :

Loyer, Dominique. (2024). Debate Analysis 1.ipynb [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

double_check.py

Command-line script that recursively scans a folder, hashes every file with SHA-256, prints each group of identical files with the original marked, and optionally deletes the duplicates after a double confirmation, keeping the first file of each group.

Keywords: duplicate detection, SHA-256, hashing, command line
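
The core technique, shown in isolation before the full script: stream each file through SHA-256 and group paths by digest, keeping only digests seen more than once. The helper mirrors calculate_hash from the listing, and the target folder (".") is just an example:

```
import hashlib
import os
from collections import defaultdict

def sha256_of(filepath, block_size=65536):
    # Stream the file so large files never have to fit in memory
    h = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            h.update(block)
    return h.hexdigest()

def duplicate_groups(folder):
    groups = defaultdict(list)
    for dirpath, _, filenames in os.walk(folder):
        for name in filenames:
            path = os.path.join(dirpath, name)
            if not os.path.islink(path):
                groups[sha256_of(path)].append(path)
    # Keep only the digests that point to more than one file
    return {h: paths for h, paths in groups.items() if len(paths) > 1}

if __name__ == "__main__":
    for paths in duplicate_groups(".").values():
        print(paths)
```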

import os
import sys
import hashlib
from collections import defaultdict

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        # Les erreurs de lecture sont ignorées, le fichier ne sera pas inclus
        return None

def find_duplicates(folder):
    """
    Trouve les fichiers en double en scannant le dossier fourni.
    Retourne un dictionnaire de hashs pointant vers des listes de fichiers dupliqués.
    """
    print(f"\n1. Calcul des empreintes des fichiers dans : {folder}")
    hashes = defaultdict(list)
    total_files = 0
    scanned_files = 0

    # Compter les fichiers d'abord pour la barre de progression
    for dirpath, _, filenames in os.walk(folder):
        total_files += len(filenames)

    # Scanner et hasher les fichiers
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            # Ignorer les liens symboliques et les fichiers de 0 octet
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            # Afficher la progression
            progress = (scanned_files / total_files) * 100
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nScan terminé.\n")
    
    # Retourner seulement les hashs qui ont plus d'un fichier (les vrais doublons)
    return {hash_val: paths for hash_val, paths in hashes.items() if len(paths) > 1}

def print_results(duplicates):
    """Affiche joliment les résultats."""
    if not duplicates:
        print("Bonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return

    print(f"2. {len(duplicates)} groupes de fichiers en double trouvés :\n")
    group_number = 1
    for file_paths in duplicates.values():
        print(f"--- Groupe {group_number} ---")
        # On trie pour avoir un ordre prévisible
        file_paths.sort()
        for i, path in enumerate(file_paths):
            if i == 0:
                print(f"  [ORIGINAL] {path}")
            else:
                print(f"  [DOUBLON]  {path}")
        print("-" * (len(str(group_number)) + 11))
        print() # Ligne vide pour la lisibilité
        group_number += 1

def delete_duplicates(duplicates):
    """Propose de supprimer les fichiers en double."""
    if not duplicates:
        return

    print("3. Action de suppression")
    print("------------------------")
    print("Pour chaque groupe, le fichier marqué [ORIGINAL] sera conservé.")
    print("Tous les fichiers marqués [DOUBLON] seront supprimés.")
    
    # Demander confirmation
    try:
        choice = input("Voulez-vous procéder à la suppression ? (o/n) : ").lower()
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return

    if choice == 'o':
        files_to_delete = []
        for file_paths in duplicates.values():
            # Garder le premier fichier (l'original), supprimer les autres
            files_to_delete.extend(file_paths[1:])
        
        if not files_to_delete:
            print("Aucun fichier à supprimer.")
            return

        print(f"\n{len(files_to_delete)} fichier(s) doublon(s) seront supprimé(s).")
        try:
            final_confirmation = input("Êtes-vous absolument sûr(e) ? Cette action est IRRÉVERSIBLE. (o/n) : ").lower()
        except (EOFError, KeyboardInterrupt):
            print("\nOpération annulée.")
            return

        if final_confirmation == 'o':
            deleted_count = 0
            for f in files_to_delete:
                try:
                    os.remove(f)
                    print(f"Supprimé : {f}")
                    deleted_count += 1
                except OSError as e:
                    print(f"ERREUR lors de la suppression de {f}: {e}")
            print(f"\nOpération terminée. {deleted_count} fichier(s) supprimé(s).")
        else:
            print("Suppression annulée.")
    else:
        print("Aucun fichier n'a été supprimé.")


if __name__ == "__main__":
    # Demander le chemin du dossier à l'utilisateur
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        
        # Sur Mac, glisser-déposer peut ajouter des barres obliques inverses pour les espaces.
        folder_path = folder_path.replace('\\ ', ' ').strip()
        # Enlever les guillemets si présents
        if folder_path.startswith("'") and folder_path.endswith("'"):
            folder_path = folder_path[1:-1]
        if folder_path.startswith('"') and folder_path.endswith('"'):
            folder_path = folder_path[1:-1]

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            sys.exit(1)

        duplicates_found = find_duplicates(folder_path)
        print_results(duplicates_found)
        delete_duplicates(duplicates_found)

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
        sys.exit(0)
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")
        sys.exit(1)

Pour citer ce code :

Loyer, Dominique. (2024). double_check.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

double_checkv2 1.py

Extended duplicate cleaner: after SHA-256 duplicate detection it keeps the most readable file name in each group, deletes the extra copies, then optionally classifies the remaining files into themed folders by calling the Gemini API (this variant issues the call through the js.fetch bridge of its execution environment).

Keywords: duplicate detection, SHA-256, smart renaming, Gemini API, automatic classification
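
A quick illustration of the name-scoring heuristic behind the smart-renaming step; the scoring function is the one from the listing below, and the candidate file names are invented:

```
def get_readability_score(filename):
    # Longer names score higher, letters earn a bonus, purely numeric stems are penalised
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

candidates = ["IMG_4521.jpg", "20231102.jpg", "facture_internet_novembre.jpg"]
print(max(candidates, key=get_readability_score))  # facture_internet_novembre.jpg
```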

import os
import sys
import hashlib
import time
import json
from collections import defaultdict

# --- Fonctions de base (trouver les doublons) ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def find_duplicates(folder):
    """Trouve les fichiers en double et retourne un dictionnaire de doublons."""
    print(f"\nÉtape 1: Recherche des doublons dans : {folder}")
    hashes = defaultdict(list)
    total_files = sum(len(files) for _, _, files in os.walk(folder))
    scanned_files = 0

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            progress = (scanned_files / total_files) * 100 if total_files > 0 else 0
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nRecherche des doublons terminée.\n")
    return {h: p for h, p in hashes.items() if len(p) > 1}

# --- NOUVEAUTÉ : Fonctions de renommage intelligent ---

def get_readability_score(filename):
    """Attribue un score à un nom de fichier pour déterminer s'il est lisible."""
    score = len(filename)  # Les noms plus longs sont souvent meilleurs
    # Bonus pour les lettres, malus pour les noms uniquement numériques
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def get_best_filename(filepaths):
    """Choisit le meilleur nom de fichier parmi une liste de doublons."""
    best_score = -1
    best_name = ""
    for path in filepaths:
        filename = os.path.basename(path)
        score = get_readability_score(filename)
        if score > best_score:
            best_score = score
            best_name = filename
    return best_name

def process_and_delete_duplicates(duplicates):
    """Gère le renommage intelligent et la suppression des doublons."""
    if not duplicates:
        print("Bonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return []

    print(f"Étape 2: Traitement de {len(duplicates)} groupes de doublons (renommage et suppression)")
    
    try:
        choice = input("Voulez-vous renommer intelligemment et supprimer les doublons ? (o/n) : ").lower()
        if choice != 'o':
            print("Opération annulée.")
            # Retourne une liste de tous les fichiers pour un éventuel classement
            all_files = set()
            for paths in duplicates.values():
                all_files.update(paths)
            return list(all_files)
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return []

    files_kept = []
    for group_paths in duplicates.values():
        group_paths.sort()  # Ordre prévisible
        original_path = group_paths[0]
        original_dir = os.path.dirname(original_path)
        
        # Renommage intelligent
        best_name = get_best_filename(group_paths)
        final_path = os.path.join(original_dir, best_name)

        if original_path != final_path:
            try:
                # S'assure que la cible n'existe pas déjà (cas rare)
                if os.path.exists(final_path):
                    print(f"  - AVERTISSEMENT: Le nom de destination '{best_name}' existe déjà. Le renommage est ignoré pour ce groupe.")
                    final_path = original_path # On annule le renommage pour ce cas
                else:
                    os.rename(original_path, final_path)
                    print(f"  - Renommé : '{os.path.basename(original_path)}' -> '{best_name}'")
            except OSError as e:
                print(f"  - ERREUR de renommage : {e}. Le nom original sera conservé.")
                final_path = original_path

        files_kept.append(final_path)

        # Suppression des autres doublons
        for path_to_delete in group_paths:
            if path_to_delete != original_path:
                try:
                    os.remove(path_to_delete)
                    # print(f"  - Supprimé : {path_to_delete}")
                except OSError as e:
                    print(f"  - ERREUR de suppression de {path_to_delete}: {e}")
    
    print("\nNettoyage des doublons terminé.")
    return files_kept


# --- NOUVEAUTÉ : Fonctions de classement par IA ---

async def classify_file(filepath, categories):
    """Utilise une IA pour deviner la catégorie d'un fichier."""
    try:
        filename = os.path.basename(filepath)
        prompt = f"Analyse ce nom de fichier '{filename}' et classe-le dans l'une de ces catégories : {categories}. Réponds uniquement avec le nom de la catégorie. Si incertain, réponds 'Autre'."
        
        chatHistory = [{"role": "user", "parts": [{"text": prompt}]}]
        payload = {"contents": chatHistory}
        apiKey = "" # Aucune clé nécessaire pour ce modèle dans cet environnement
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
        
        response = await __fetch__(apiUrl, {
            'method': 'POST',
            'headers': {'Content-Type': 'application/json'},
            'body': json.dumps(payload)
        })
        
        result = await response.json()
        
        if result.get('candidates'):
            return result['candidates'][0]['content']['parts'][0]['text'].strip()
        else:
            return "Autre"
            
    except Exception as e:
        # print(f"Erreur API pour {filepath}: {e}")
        return "Autre"

async def classify_and_move_files(base_folder, all_files):
    """Classe tous les fichiers restants et les déplace dans des sous-dossiers thématiques."""
    print("\nÉtape 3: Classement thématique des fichiers restants par IA")
    try:
        choice = input("Voulez-vous lancer le classement automatique ? (Cela peut prendre du temps) (o/n) : ").lower()
        if choice != 'o':
            print("Classement ignoré. Le programme est terminé.")
            return
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return

    # Catégories personnalisables
    categories = ["Factures", "Recherche", "Contrats", "Photos", "Personnel", "Travail", "Admin", "Projets", "Autre"]
    
    # Crée un dossier "Fichiers Classés" pour ne pas mélanger
    classified_folder = os.path.join(base_folder, "Fichiers Classés")
    os.makedirs(classified_folder, exist_ok=True)
    
    print(f"Les fichiers seront classés dans : {classified_folder}")
    print("Catégories utilisées :", ", ".join(categories))
    
    total_files_to_classify = len(all_files)
    for i, filepath in enumerate(all_files):
        if not os.path.exists(filepath):
            continue

        progress = ((i + 1) / total_files_to_classify) * 100
        sys.stdout.write(f"\rClassement: {progress:.1f}% ({i+1}/{total_files_to_classify}) - {os.path.basename(filepath)}")
        sys.stdout.flush()

        category = await classify_file(filepath, categories)
        
        # Crée le dossier de la catégorie s'il n'existe pas
        target_dir = os.path.join(classified_folder, category)
        os.makedirs(target_dir, exist_ok=True)
        
        # Déplace le fichier
        try:
            target_path = os.path.join(target_dir, os.path.basename(filepath))
            # Gère le cas où un fichier du même nom existe déjà dans la destination
            if os.path.exists(target_path):
                base, ext = os.path.splitext(os.path.basename(filepath))
                target_path = os.path.join(target_dir, f"{base}_{int(time.time())}{ext}")
            
            os.rename(filepath, target_path)
        except OSError as e:
            print(f"\nErreur lors du déplacement de {filepath} : {e}")
            
    print("\nClassement thématique terminé !")


# --- Fonction principale ---

async def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        folder_path = folder_path.replace('\\ ', ' ').strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates_found = find_duplicates(folder_path)
        files_to_classify = process_and_delete_duplicates(duplicates_found)

        # Ajouter les fichiers non-dupliqués à la liste pour le classement
        all_files_in_folder = set()
        for dirpath, _, filenames in os.walk(folder_path):
             # On ne scanne que le dossier de base après le nettoyage, pas les nouveaux sous-dossiers
            if "Fichiers Classés" in dirpath:
                continue
            for f in filenames:
                all_files_in_folder.add(os.path.join(dirpath, f))
        
        # Combine les fichiers uniques et ceux gardés après dé-duplication
        final_list_of_files = list(set(files_to_classify) | all_files_in_folder)
        final_list_of_files = [f for f in final_list_of_files if os.path.exists(f)]


        if final_list_of_files:
            await classify_and_move_files(folder_path, final_list_of_files)
        else:
            print("\nAucun fichier à classer.")

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")

if __name__ == "__main__":
    import asyncio
    # Cette instruction est spécifique à l'environnement d'exécution pour permettre les appels réseau.
    from js import fetch as __fetch__ 
    asyncio.run(main())

Pour citer ce code :

Loyer, Dominique. (2024). double_checkv2 1.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

double_checkv2.py

Same tool as double_checkv2 1.py: SHA-256 duplicate detection, smart renaming of the kept file, and optional AI-assisted classification of the remaining files via the Gemini API, called through the environment's js.fetch bridge.

Keywords: duplicate detection, SHA-256, smart renaming, Gemini API, automatic classification
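
This variant performs the classification call through js.fetch, which only works in an environment that exposes a JavaScript fetch to Python (for example, a browser runtime). Outside such an environment the same request can be made with requests, as dcv3.py does earlier on this page. A minimal sketch, where the endpoint, model name and payload shape are the ones used in the listing, and GEMINI_API_KEY is a placeholder for a key you would normally have to supply:

```
import os
import requests

api_key = os.environ.get("GEMINI_API_KEY", "")  # placeholder; a real key is normally required
url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"

categories = ["Factures", "Recherche", "Contrats", "Photos", "Autre"]
prompt = (f"Analyse ce nom de fichier 'facture_edf_mars.pdf' et classe-le dans l'une de ces "
          f"catégories : {categories}. Réponds uniquement avec le nom de la catégorie.")

payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
response = requests.post(url, json=payload, timeout=10)
response.raise_for_status()
print(response.json()["candidates"][0]["content"]["parts"][0]["text"].strip())
```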

import os
import sys
import hashlib
import time
import json
from collections import defaultdict

# --- Fonctions de base (trouver les doublons) ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def find_duplicates(folder):
    """Trouve les fichiers en double et retourne un dictionnaire de doublons."""
    print(f"\nÉtape 1: Recherche des doublons dans : {folder}")
    hashes = defaultdict(list)
    total_files = sum(len(files) for _, _, files in os.walk(folder))
    scanned_files = 0

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            progress = (scanned_files / total_files) * 100 if total_files > 0 else 0
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nRecherche des doublons terminée.\n")
    return {h: p for h, p in hashes.items() if len(p) > 1}

# --- NOUVEAUTÉ : Fonctions de renommage intelligent ---

def get_readability_score(filename):
    """Attribue un score à un nom de fichier pour déterminer s'il est lisible."""
    score = len(filename)  # Les noms plus longs sont souvent meilleurs
    # Bonus pour les lettres, malus pour les noms uniquement numériques
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def get_best_filename(filepaths):
    """Choisit le meilleur nom de fichier parmi une liste de doublons."""
    best_score = -1
    best_name = ""
    for path in filepaths:
        filename = os.path.basename(path)
        score = get_readability_score(filename)
        if score > best_score:
            best_score = score
            best_name = filename
    return best_name

def process_and_delete_duplicates(duplicates):
    """Gère le renommage intelligent et la suppression des doublons."""
    if not duplicates:
        print("Bonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return []

    print(f"Étape 2: Traitement de {len(duplicates)} groupes de doublons (renommage et suppression)")
    
    try:
        choice = input("Voulez-vous renommer intelligemment et supprimer les doublons ? (o/n) : ").lower()
        if choice != 'o':
            print("Opération annulée.")
            # Retourne une liste de tous les fichiers pour un éventuel classement
            all_files = set()
            for paths in duplicates.values():
                all_files.update(paths)
            return list(all_files)
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return []

    files_kept = []
    for group_paths in duplicates.values():
        group_paths.sort()  # Ordre prévisible
        original_path = group_paths[0]
        original_dir = os.path.dirname(original_path)
        
        # Renommage intelligent
        best_name = get_best_filename(group_paths)
        final_path = os.path.join(original_dir, best_name)

        if original_path != final_path:
            try:
                # S'assure que la cible n'existe pas déjà (cas rare)
                if os.path.exists(final_path):
                    print(f"  - AVERTISSEMENT: Le nom de destination '{best_name}' existe déjà. Le renommage est ignoré pour ce groupe.")
                    final_path = original_path # On annule le renommage pour ce cas
                else:
                    os.rename(original_path, final_path)
                    print(f"  - Renommé : '{os.path.basename(original_path)}' -> '{best_name}'")
            except OSError as e:
                print(f"  - ERREUR de renommage : {e}. Le nom original sera conservé.")
                final_path = original_path

        files_kept.append(final_path)

        # Suppression des autres doublons
        for path_to_delete in group_paths:
            if path_to_delete != original_path:
                try:
                    os.remove(path_to_delete)
                    # print(f"  - Supprimé : {path_to_delete}")
                except OSError as e:
                    print(f"  - ERREUR de suppression de {path_to_delete}: {e}")
    
    print("\nNettoyage des doublons terminé.")
    return files_kept


# --- NOUVEAUTÉ : Fonctions de classement par IA ---

async def classify_file(filepath, categories):
    """Utilise une IA pour deviner la catégorie d'un fichier."""
    try:
        filename = os.path.basename(filepath)
        prompt = f"Analyse ce nom de fichier '{filename}' et classe-le dans l'une de ces catégories : {categories}. Réponds uniquement avec le nom de la catégorie. Si incertain, réponds 'Autre'."
        
        chatHistory = [{"role": "user", "parts": [{"text": prompt}]}]
        payload = {"contents": chatHistory}
        apiKey = "" # Aucune clé nécessaire pour ce modèle dans cet environnement
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
        
        response = await __fetch__(apiUrl, {
            'method': 'POST',
            'headers': {'Content-Type': 'application/json'},
            'body': json.dumps(payload)
        })
        
        result = await response.json()
        
        if result.get('candidates'):
            return result['candidates'][0]['content']['parts'][0]['text'].strip()
        else:
            return "Autre"
            
    except Exception as e:
        # print(f"Erreur API pour {filepath}: {e}")
        return "Autre"

async def classify_and_move_files(base_folder, all_files):
    """Classe tous les fichiers restants et les déplace dans des sous-dossiers thématiques."""
    print("\nÉtape 3: Classement thématique des fichiers restants par IA")
    try:
        choice = input("Voulez-vous lancer le classement automatique ? (Cela peut prendre du temps) (o/n) : ").lower()
        if choice != 'o':
            print("Classement ignoré. Le programme est terminé.")
            return
    except (EOFError, KeyboardInterrupt):
        print("\nOpération annulée.")
        return

    # Catégories personnalisables
    categories = ["Factures", "Recherche", "Contrats", "Photos", "Personnel", "Travail", "Admin", "Projets", "Autre"]
    
    # Crée un dossier "Fichiers Classés" pour ne pas mélanger
    classified_folder = os.path.join(base_folder, "Fichiers Classés")
    os.makedirs(classified_folder, exist_ok=True)
    
    print(f"Les fichiers seront classés dans : {classified_folder}")
    print("Catégories utilisées :", ", ".join(categories))
    
    total_files_to_classify = len(all_files)
    for i, filepath in enumerate(all_files):
        if not os.path.exists(filepath):
            continue

        progress = ((i + 1) / total_files_to_classify) * 100
        sys.stdout.write(f"\rClassement: {progress:.1f}% ({i+1}/{total_files_to_classify}) - {os.path.basename(filepath)}")
        sys.stdout.flush()

        category = await classify_file(filepath, categories)
        
        # Crée le dossier de la catégorie s'il n'existe pas
        target_dir = os.path.join(classified_folder, category)
        os.makedirs(target_dir, exist_ok=True)
        
        # Déplace le fichier
        try:
            target_path = os.path.join(target_dir, os.path.basename(filepath))
            # Gère le cas où un fichier du même nom existe déjà dans la destination
            if os.path.exists(target_path):
                base, ext = os.path.splitext(os.path.basename(filepath))
                target_path = os.path.join(target_dir, f"{base}_{int(time.time())}{ext}")
            
            os.rename(filepath, target_path)
        except OSError as e:
            print(f"\nErreur lors du déplacement de {filepath} : {e}")
            
    print("\nClassement thématique terminé !")


# --- Fonction principale ---

async def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        folder_path = folder_path.replace('\\ ', ' ').strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates_found = find_duplicates(folder_path)
        files_to_classify = process_and_delete_duplicates(duplicates_found)

        # Ajouter les fichiers non-dupliqués à la liste pour le classement
        all_files_in_folder = set()
        for dirpath, _, filenames in os.walk(folder_path):
             # On ne scanne que le dossier de base après le nettoyage, pas les nouveaux sous-dossiers
            if "Fichiers Classés" in dirpath:
                continue
            for f in filenames:
                all_files_in_folder.add(os.path.join(dirpath, f))
        
        # Combine les fichiers uniques et ceux gardés après dé-duplication
        final_list_of_files = list(set(files_to_classify) | all_files_in_folder)
        final_list_of_files = [f for f in final_list_of_files if os.path.exists(f)]


        if final_list_of_files:
            await classify_and_move_files(folder_path, final_list_of_files)
        else:
            print("\nAucun fichier à classer.")

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")

if __name__ == "__main__":
    import asyncio
    # Cette instruction est spécifique à l'environnement d'exécution pour permettre les appels réseau.
    from js import fetch as __fetch__ 
    asyncio.run(main())

Pour citer ce code :

Loyer, Dominique. (2024). double_checkv2.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

double.py

Tkinter GUI that scans a chosen folder for duplicate files (SHA-256), shows the duplicate groups in a tree view with a progress bar, and can delete every duplicate while keeping the first file of each group; a terminal debug print helps diagnose macOS folder-permission issues (see the notes after the listing).

Keywords: Tkinter, GUI, duplicate detection, SHA-256, macOS permissions
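
Before the full application, a minimal, self-contained sketch of the progress-callback pattern the GUI relies on: the scanning code reports (current, total) and the callback updates a ttk.Progressbar. Widget names and the simulated 50-file scan are illustrative, not taken from the script:

```
import tkinter as tk
from tkinter import ttk

root = tk.Tk()
root.title("Progress demo")
bar = ttk.Progressbar(root, orient="horizontal", length=300, mode="determinate")
bar.pack(padx=10, pady=10)
label = tk.Label(root, text="")
label.pack(pady=(0, 10))

def update_progress(current, total):
    # Same contract as the app's callback: scale to 0-100 and refresh the UI
    bar['value'] = (current / total) * 100 if total else 100
    label['text'] = f"{current} / {total} fichiers scannés"
    root.update_idletasks()

def fake_scan(i=0, total=50):
    # Simulate a scan that reports progress every 50 ms
    update_progress(i, total)
    if i < total:
        root.after(50, fake_scan, i + 1, total)

fake_scan()
root.mainloop()
```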

import os
import hashlib
import tkinter as tk
from tkinter import filedialog, ttk, messagebox

# --- Fonctions principales ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            for block in iter(lambda: f.read(block_size), b''):
                sha256.update(block)
        return sha256.hexdigest()
    except IOError as e:
        print(f"Erreur de lecture du fichier : {filepath} ({e})")
        return None

def find_duplicate_files(folder_path, progress_callback):
    """
    Trouve les fichiers en double dans un dossier et ses sous-dossiers.
    Retourne un dictionnaire où les clés sont les hashs et les valeurs sont les listes de chemins de fichiers identiques.
    """
    hashes = {}
    duplicates = {}
    file_count = 0
    scanned_count = 0

    for dirpath, _, filenames in os.walk(folder_path):
        file_count += len(filenames)

    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath):
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes.setdefault(file_hash, []).append(filepath)
            scanned_count += 1
            progress_callback(scanned_count, file_count)

    for file_hash, filepaths in hashes.items():
        if len(filepaths) > 1:
            duplicates[file_hash] = filepaths
            
    return duplicates

def delete_files(files_to_delete):
    """Supprime les fichiers sélectionnés."""
    for file_path in files_to_delete:
        try:
            os.remove(file_path)
            print(f"Fichier supprimé : {file_path}")
        except OSError as e:
            print(f"Erreur lors de la suppression du fichier {file_path}: {e}")

# --- Interface graphique (GUI) ---

class DuplicateFinderApp(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("Chercheur de Fichiers Doublons (v2)")
        self.geometry("800x600")
        self.configure(bg="#f0f0f0")
        self.duplicates = {}
        self.create_widgets()

    def create_widgets(self):
        top_frame = tk.Frame(self, padx=10, pady=10, bg="#f0f0f0")
        top_frame.pack(fill=tk.X)

        tk.Label(top_frame, text="Dossier à scanner :", bg="#f0f0f0").pack(side=tk.LEFT, padx=(0, 5))
        
        # MODIFICATION : Utilisation directe de l'Entry widget
        self.folder_entry = tk.Entry(top_frame, width=60)
        self.folder_entry.pack(side=tk.LEFT, expand=True, fill=tk.X)

        browse_button = ttk.Button(top_frame, text="Parcourir...", command=self.browse_folder)
        browse_button.pack(side=tk.LEFT, padx=(5, 0))

        scan_button = ttk.Button(self, text="Scanner les Fichiers", command=self.start_scan)
        scan_button.pack(pady=10)

        self.progress = ttk.Progressbar(self, orient="horizontal", length=400, mode="determinate")
        self.progress.pack(pady=5)
        self.progress_label = tk.Label(self, text="", bg="#f0f0f0")
        self.progress_label.pack()

        result_frame = tk.Frame(self, bg="white", bd=1, relief=tk.SUNKEN)
        result_frame.pack(expand=True, fill=tk.BOTH, padx=10, pady=5)
        
        self.tree = ttk.Treeview(result_frame, columns=("path",), show="headings")
        self.tree.heading("path", text="Chemin du Fichier")
        self.tree.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)
        
        scrollbar = ttk.Scrollbar(result_frame, orient="vertical", command=self.tree.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.tree.configure(yscrollcommand=scrollbar.set)

        action_frame = tk.Frame(self, padx=10, pady=10, bg="#f0f0f0")
        action_frame.pack(fill=tk.X)

        delete_button = ttk.Button(action_frame, text="Supprimer la sélection (conserver le premier)", command=self.delete_selected)
        delete_button.pack(side=tk.RIGHT)

    def browse_folder(self):
        """Ouvre une boîte de dialogue pour choisir un dossier."""
        directory = filedialog.askdirectory(title="Sélectionnez un dossier")
        
        # MODIFICATION : Ligne de débogage pour voir si le chemin est capturé
        print(f"Terminal debug: Dossier sélectionné -> '{directory}'")
        
        if directory:
            self.folder_entry.delete(0, tk.END)
            self.folder_entry.insert(0, directory)

    def update_progress(self, current, total):
        self.progress['value'] = (current / total) * 100
        self.progress_label['text'] = f"{current} / {total} fichiers scannés"
        self.update_idletasks()

    def start_scan(self):
        # MODIFICATION : Obtenir le chemin directement depuis l'Entry widget
        path = self.folder_entry.get()
        if not path:
            messagebox.showwarning("Avertissement", "Veuillez sélectionner un dossier avant de scanner.")
            return

        for i in self.tree.get_children():
            self.tree.delete(i)
        
        self.progress['value'] = 0
        self.progress_label['text'] = "Scan en cours..."
        self.update()

        try:
            self.duplicates = find_duplicate_files(path, self.update_progress)
            self.display_duplicates()
        except Exception as e:
            messagebox.showerror("Erreur", f"Une erreur est survenue : {e}")

    def display_duplicates(self):
        if not self.duplicates:
            messagebox.showinfo("Résultats", "Aucun fichier en double n'a été trouvé !")
            self.progress_label['text'] = "Scan terminé. Aucun doublon."
            return

        self.progress_label['text'] = f"Scan terminé. {len(self.duplicates)} groupes de doublons trouvés."
        group_count = 1
        for file_hash, filepaths in self.duplicates.items():
            group_id = self.tree.insert("", "end", values=(f"--- Groupe de doublons {group_count} ({len(filepaths)} fichiers) ---",))
            self.tree.item(group_id, tags=('group_header',))
            for i, path in enumerate(filepaths):
                tag = 'original' if i == 0 else 'duplicate'
                self.tree.insert(group_id, "end", values=(path,), tags=(tag, file_hash))
            group_count += 1
        
        self.tree.tag_configure('group_header', background='#e0e0e0', font=('Helvetica', 10, 'bold'))
        self.tree.tag_configure('original', foreground='green')
        self.tree.tag_configure('duplicate', foreground='red')

    def delete_selected(self):
        if not self.duplicates:
            messagebox.showinfo("Information", "Aucun doublon à supprimer.")
            return

        if not messagebox.askyesno("Confirmation", 
                                   "Voulez-vous vraiment supprimer tous les fichiers marqués en rouge ?\n"
                                   "Cette action est irréversible. Seul le premier fichier de chaque groupe (en vert) sera conservé."):
            return

        files_to_remove = []
        for file_hash, filepaths in self.duplicates.items():
            files_to_remove.extend(filepaths[1:])

        delete_files(files_to_remove)
        messagebox.showinfo("Opération terminée", f"{len(files_to_remove)} fichiers doublons ont été supprimés.")
        self.start_scan()


if __name__ == "__main__":
    app = DuplicateFinderApp()
    app.mainloop()


### Instructions

1.  **Replace** the code in your `doublon.py` file with this new code.
2.  **Run the script** from the Terminal as before (`python3 doublon.py`).
3.  Keep an eye on the **Terminal** window in the background.
4.  In the application, click **"Parcourir..."**, select a folder and click **"Open"**.
5.  **Immediately check your Terminal window.**

You should see a new line appear that looks like this:
`Terminal debug: Dossier sélectionné -> '/Users/yourname/Documents'`

**Scenario 1: The path shows up in the Terminal AND in the application**
Perfect, the problem is solved and you can now launch the scan.

**Scenario 2: The path shows up in the Terminal but NOT in the application**
Unlikely, but if it happens it is a Tkinter display bug.

**Scenario 3: The path does NOT show up in the Terminal (it prints empty quotes: `''`)**
This is the most likely case. It confirms that the problem comes from **macOS permissions**: the operating system blocks the script and never passes it the folder you chose.

If that is your case (Scenario 3), here is how to fix it:
Go to **System Settings** > **Privacy & Security** > **Files and Folders**. Find the **Terminal** application in the list and make sure it is allowed to access your "Documents" or "Downloads" folder, or wherever your files live.

Let me know how it goes; the debug line in the Terminal will tell us where the problem lies.

Pour citer ce code :

Loyer, Dominique. (2024). double.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

doublecheck.py

Revised version of the Tkinter duplicate finder: same GUI workflow, with added handling of empty folders and of FileNotFoundError and PermissionError (macOS security prompts) during the scan.

Keywords: Tkinter, GUI, duplicate detection, SHA-256, error handling

import os
import hashlib
import tkinter as tk
from tkinter import filedialog, ttk, messagebox

# --- Fonctions principales ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            for block in iter(lambda: f.read(block_size), b''):
                sha256.update(block)
        return sha256.hexdigest()
    except IOError as e:
        print(f"Erreur de lecture du fichier : {filepath} ({e})")
        return None

def find_duplicate_files(folder_path, progress_callback):
    """
    Trouve les fichiers en double dans un dossier et ses sous-dossiers.
    Retourne un dictionnaire où les clés sont les hashs et les valeurs sont les listes de chemins de fichiers identiques.
    """
    hashes = {}
    duplicates = {}
    file_count = 0
    scanned_count = 0

    try:
        for dirpath, _, filenames in os.walk(folder_path):
            file_count += len(filenames)

        if file_count == 0:
            progress_callback(0, 0) # Gérer le cas du dossier vide
            return {}

        for dirpath, _, filenames in os.walk(folder_path):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                if not os.path.islink(filepath):
                    file_hash = calculate_hash(filepath)
                    if file_hash:
                        hashes.setdefault(file_hash, []).append(filepath)
                scanned_count += 1
                progress_callback(scanned_count, file_count)

    except FileNotFoundError:
        messagebox.showerror("Erreur", f"Le dossier '{folder_path}' n'a pas été trouvé.")
        return {}
    except PermissionError:
        messagebox.showerror("Erreur", f"Permission refusée pour accéder au dossier '{folder_path}'.\nVeuillez vérifier les autorisations de sécurité de macOS.")
        return {}


    for file_hash, filepaths in hashes.items():
        if len(filepaths) > 1:
            duplicates[file_hash] = filepaths
            
    return duplicates

def delete_files(files_to_delete):
    """Supprime les fichiers sélectionnés."""
    for file_path in files_to_delete:
        try:
            os.remove(file_path)
            print(f"Fichier supprimé : {file_path}")
        except OSError as e:
            print(f"Erreur lors de la suppression du fichier {file_path}: {e}")

# --- Interface graphique (GUI) ---

class DuplicateFinderApp(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("Chercheur de Fichiers Doublons (v2 - corrigé)")
        self.geometry("800x600")
        self.configure(bg="#f0f0f0")
        self.duplicates = {}
        self.create_widgets()

    def create_widgets(self):
        top_frame = tk.Frame(self, padx=10, pady=10, bg="#f0f0f0")
        top_frame.pack(fill=tk.X)

        tk.Label(top_frame, text="Dossier à scanner :", bg="#f0f0f0").pack(side=tk.LEFT, padx=(0, 5))
        
        self.folder_entry = tk.Entry(top_frame, width=60)
        self.folder_entry.pack(side=tk.LEFT, expand=True, fill=tk.X)

        browse_button = ttk.Button(top_frame, text="Parcourir...", command=self.browse_folder)
        browse_button.pack(side=tk.LEFT, padx=(5, 0))

        scan_button = ttk.Button(self, text="Scanner les Fichiers", command=self.start_scan)
        scan_button.pack(pady=10)

        self.progress = ttk.Progressbar(self, orient="horizontal", length=400, mode="determinate")
        self.progress.pack(pady=5)
        self.progress_label = tk.Label(self, text="", bg="#f0f0f0")
        self.progress_label.pack()

        result_frame = tk.Frame(self, bg="white", bd=1, relief=tk.SUNKEN)
        result_frame.pack(expand=True, fill=tk.BOTH, padx=10, pady=5)
        
        self.tree = ttk.Treeview(result_frame, columns=("path",), show="headings")
        self.tree.heading("path", text="Chemin du Fichier")
        self.tree.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)
        
        scrollbar = ttk.Scrollbar(result_frame, orient="vertical", command=self.tree.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.tree.configure(yscrollcommand=scrollbar.set)

        action_frame = tk.Frame(self, padx=10, pady=10, bg="#f0f0f0")
        action_frame.pack(fill=tk.X)

        delete_button = ttk.Button(action_frame, text="Supprimer la sélection (conserver le premier)", command=self.delete_selected)
        delete_button.pack(side=tk.RIGHT)

    def browse_folder(self):
        directory = filedialog.askdirectory(title="Sélectionnez un dossier")
        
        print(f"Terminal debug: Dossier sélectionné -> '{directory}'")
        
        if directory:
            self.folder_entry.delete(0, tk.END)
            self.folder_entry.insert(0, directory)

    def update_progress(self, current, total):
        if total > 0:
            self.progress['value'] = (current / total) * 100
        else:
            self.progress['value'] = 100
        self.progress_label['text'] = f"{current} / {total} fichiers scannés"
        self.update_idletasks()

    def start_scan(self):
        path = self.folder_entry.get()
        if not path:
            messagebox.showwarning("Avertissement", "Veuillez sélectionner un dossier avant de scanner.")
            return

        for i in self.tree.get_children():
            self.tree.delete(i)
        
        self.progress['value'] = 0
        self.progress_label['text'] = "Scan en cours..."
        self.update()

        try:
            self.duplicates = find_duplicate_files(path, self.update_progress)
            self.display_duplicates()
        except Exception as e:
            messagebox.showerror("Erreur", f"Une erreur est survenue pendant le scan: {e}")

    def display_duplicates(self):
        if not self.duplicates:
            messagebox.showinfo("Résultats", "Aucun fichier en double n'a été trouvé !")
            self.progress_label['text'] = "Scan terminé. Aucun doublon."
            return

        self.progress_label['text'] = f"Scan terminé. {len(self.duplicates)} groupes de doublons trouvés."
        group_count = 1
        for file_hash, filepaths in self.duplicates.items():
            # open=True so the child rows (file paths) are displayed; the tree/expander column is hidden by show="headings"
            group_id = self.tree.insert("", "end", open=True, values=(f"--- Groupe de doublons {group_count} ({len(filepaths)} fichiers) ---",))
            self.tree.item(group_id, tags=('group_header',))
            filepaths.sort() # Trier pour avoir un ordre consistent
            for i, path in enumerate(filepaths):
                tag = 'original' if i == 0 else 'duplicate'
                self.tree.insert(group_id, "end", values=(path,), tags=(tag, file_hash))
            group_count += 1
        
        self.tree.tag_configure('group_header', background='#e0e0e0', font=('Helvetica', 10, 'bold'))
        self.tree.tag_configure('original', foreground='green')
        self.tree.tag_configure('duplicate', foreground='red')

    def delete_selected(self):
        if not self.duplicates:
            messagebox.showinfo("Information", "Aucun doublon à supprimer.")
            return

        if not messagebox.askyesno("Confirmation", 
                                   "Voulez-vous vraiment supprimer tous les fichiers marqués en rouge ?\n"
                                   "Cette action est irréversible. Seul le premier fichier de chaque groupe (en vert) sera conservé."):
            return

        files_to_remove = []
        for file_hash, filepaths in self.duplicates.items():
            files_to_remove.extend(filepaths[1:])

        delete_files(files_to_remove)
        messagebox.showinfo("Opération terminée", f"{len(files_to_remove)} fichiers doublons ont été supprimés.")
        
        # Rafraîchir l'affichage
        self.start_scan()


if __name__ == "__main__":
    app = DuplicateFinderApp()
    app.mainloop()
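
A minimal usage sketch, assuming the functions above are saved as doublecheck.py and importable; the folder path and the callback name are illustrative:

from doublecheck import find_duplicate_files

def report(scanned, total):
    # Console progress callback with the (scanned, total) signature expected by find_duplicate_files
    print(f"\r{scanned}/{total} files scanned", end="")

groups = find_duplicate_files("/path/to/folder", report)
print()
for file_hash, paths in groups.items():
    print(f"Group {file_hash[:12]}... : {len(paths)} identical files")
    for p in paths:
        print(f"  - {p}")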

To cite this code:

Loyer, Dominique. (2024). doublecheck.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

doublon.py

Another version of the duplicate finder: the same SHA-256 based scan and Tkinter interface, with additional guards so that scanning and reporting still work when no graphical display is available (headless mode).

Keywords: duplicate files, SHA-256, Tkinter, headless

import os
import hashlib
import tkinter as tk
from tkinter import filedialog, ttk, messagebox

# --- Fonctions principales ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            # Lire le fichier par blocs pour ne pas surcharger la mémoire
            for block in iter(lambda: f.read(block_size), b''):
                sha256.update(block)
        return sha256.hexdigest()
    except IOError as e:
        # Gère les erreurs si un fichier ne peut pas être lu
        print(f"Erreur de lecture du fichier : {filepath} ({e})")
        return None

def find_duplicate_files(folder_path, progress_callback):
    """
    Trouve les fichiers en double dans un dossier et ses sous-dossiers.
    Retourne un dictionnaire où les clés sont les hash et les valeurs sont les listes de chemins de fichiers identiques.
    """
    hashes = {}
    duplicates = {}
    file_count = 0
    scanned_count = 0

    # Première passe : compter le nombre total de fichiers
    for dirpath, _, filenames in os.walk(folder_path):
        file_count += len(filenames)

    # Deuxième passe : scanner les fichiers et trouver les doublons
    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            
            # Ignore les liens symboliques pour éviter les erreurs
            if not os.path.islink(filepath):
                file_hash = calculate_hash(filepath)
                if file_hash:
                    # Ajoute le fichier à la liste des fichiers ayant ce hash
                    hashes.setdefault(file_hash, []).append(filepath)

            scanned_count += 1
            # Call the progress callback only if it's provided
            if progress_callback:
                progress_callback(scanned_count, file_count)


    # Filtre pour ne garder que les hashs avec plus d'un fichier (les doublons)
    for file_hash, filepaths in hashes.items():
        if len(filepaths) > 1:
            duplicates[file_hash] = filepaths
            
    return duplicates

def delete_files(files_to_delete):
    """Supprime les fichiers sélectionnés."""
    for file_path in files_to_delete:
        try:
            os.remove(file_path)
            print(f"Fichier supprimé : {file_path}")
        except OSError as e:
            print(f"Erreur lors de la suppression du fichier {file_path}: {e}")

# --- Interface graphique (GUI) ---

class DuplicateFinderApp(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("Chercheur de Fichiers Doublons")
        self.geometry("800x600")
        self.configure(bg="#f0f0f0")

        # --- Variables ---
        self.folder_path = tk.StringVar()
        self.duplicates = {}

        # --- Widgets ---
        self.create_widgets()

    def create_widgets(self):
        # Frame pour la sélection du dossier
        top_frame = tk.Frame(self, padx=10, pady=10, bg="#f0f0f0")
        top_frame.pack(fill=tk.X)

        tk.Label(top_frame, text="Dossier à scanner :", bg="#f0f0f0").pack(side=tk.LEFT, padx=(0, 5))
        
        entry = tk.Entry(top_frame, textvariable=self.folder_path, width=60)
        entry.pack(side=tk.LEFT, expand=True, fill=tk.X)

        browse_button = ttk.Button(top_frame, text="Parcourir...", command=self.browse_folder)
        browse_button.pack(side=tk.LEFT, padx=(5, 0))

        # Bouton pour démarrer le scan
        scan_button = ttk.Button(self, text="Scanner les Fichiers", command=self.start_scan)
        scan_button.pack(pady=10)

        # Barre de progression
        self.progress = ttk.Progressbar(self, orient="horizontal", length=400, mode="determinate")
        self.progress.pack(pady=5)
        self.progress_label = tk.Label(self, text="", bg="#f0f0f0")
        self.progress_label.pack()

        # Frame pour les résultats
        result_frame = tk.Frame(self, bg="white", bd=1, relief=tk.SUNKEN)
        result_frame.pack(expand=True, fill=tk.BOTH, padx=10, pady=5)
        
        # Arbre pour afficher les résultats
        self.tree = ttk.Treeview(result_frame, columns=("path",), show="headings")
        self.tree.heading("path", text="Chemin du Fichier")
        self.tree.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)
        
        # Scrollbar
        scrollbar = ttk.Scrollbar(result_frame, orient="vertical", command=self.tree.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.tree.configure(yscrollcommand=scrollbar.set)

        # Frame pour les boutons d'action
        action_frame = tk.Frame(self, padx=10, pady=10, bg="#f0f0f0")
        action_frame.pack(fill=tk.X)

        delete_button = ttk.Button(action_frame, text="Supprimer la sélection (conserver le premier)", command=self.delete_selected)
        delete_button.pack(side=tk.RIGHT)

    def browse_folder(self):
        """Ouvre une boîte de dialogue pour choisir un dossier."""
        directory = filedialog.askdirectory(title="Sélectionnez un dossier")
        if directory:
            self.folder_path.set(directory)

    def update_progress(self, current, total):
        """Met à jour la barre de progression."""
        # Ensure self.progress exists before trying to update it
        if hasattr(self, 'progress'):
            self.progress['value'] = (current / total) * 100
            self.progress_label['text'] = f"{current} / {total} fichiers scannés"
            self.update_idletasks() # Met à jour l'interface

    def start_scan(self):
        """Lance le processus de scan."""
        path = self.folder_path.get()
        if not path:
            messagebox.showwarning("Avertissement", "Veuillez sélectionner un dossier avant de scanner.")
            return

        # Nettoyer l'affichage précédent
        for i in self.tree.get_children():
            self.tree.delete(i)
        
        # Ensure self.progress exists before trying to reset it
        if hasattr(self, 'progress'):
            self.progress['value'] = 0
            self.progress_label['text'] = "Scan en cours..."
            self.update()

        try:
            # Pass the update_progress method if the GUI is running
            progress_func = self.update_progress if hasattr(self, 'progress') else None
            self.duplicates = find_duplicate_files(path, progress_func)
            self.display_duplicates()
        except Exception as e:
            messagebox.showerror("Erreur", f"Une erreur est survenue : {e}")


    def display_duplicates(self):
        """Affiche les doublons trouvés dans l'arbre."""
        # Only attempt to display if the GUI is running
        if not hasattr(self, 'tree'):
             if self.duplicates:
                 print("Doublons trouvés (GUI non affichée):")
                 for file_hash, filepaths in self.duplicates.items():
                     print(f"  Hash: {file_hash}")
                     for path in filepaths:
                         print(f"    - {path}")
             else:
                 print("Aucun fichier en double n'a été trouvé.")
             return


        if not self.duplicates:
            messagebox.showinfo("Résultats", "Aucun fichier en double n'a été trouvé !")
            self.progress_label['text'] = "Scan terminé. Aucun doublon."
            return

        self.progress_label['text'] = f"Scan terminé. {len(self.duplicates)} groupes de doublons trouvés."

        group_count = 1
        for file_hash, filepaths in self.duplicates.items():
            # Add a header row for each duplicate group; open=True so its child rows stay visible
            # even though the tree/expander column is hidden by show="headings"
            group_id = self.tree.insert("", "end", text="", open=True, values=(f"--- Groupe de doublons {group_count} ({len(filepaths)} fichiers) ---",))
            self.tree.item(group_id, tags=('group_header',))

            for i, path in enumerate(filepaths):
                # Le premier fichier est marqué comme "original"
                tag = 'original' if i == 0 else 'duplicate'
                self.tree.insert(group_id, "end", values=(path,), tags=(tag, file_hash))
            
            group_count += 1
        
        # Appliquer des couleurs pour une meilleure lisibilité
        self.tree.tag_configure('group_header', background='#e0e0e0', font=('Helvetica', 10, 'bold'))
        self.tree.tag_configure('original', foreground='green')
        self.tree.tag_configure('duplicate', foreground='red')


    def delete_selected(self):
        """Supprime les doublons en gardant le premier de chaque groupe."""
         # Only attempt to delete if the GUI is running
        if not hasattr(self, 'tree'):
             print("Suppression via GUI non possible, GUI non affichée.")
             return


        if not self.duplicates:
            messagebox.showinfo("Information", "Aucun doublon à supprimer.")
            return

        if not messagebox.askyesno("Confirmation", 
                                   "Voulez-vous vraiment supprimer tous les fichiers marqués en rouge ?\n"
                                   "Cette action est irréversible. Seul le premier fichier de chaque groupe (en vert) sera conservé."):
            return

        files_to_remove = []
        for file_hash, filepaths in self.duplicates.items():
            # On ajoute tous les fichiers sauf le premier à la liste de suppression
            files_to_remove.extend(filepaths[1:])

        delete_files(files_to_remove)
        
        messagebox.showinfo("Opération terminée", f"{len(files_to_remove)} fichiers doublons ont été supprimés.")
        
        # Rafraîchir l'affichage
        self.start_scan()


if __name__ == "__main__":
    # Check if a display is available before trying to start the GUI
    if 'DISPLAY' in os.environ:
        print("GUI environment detected, starting application...")
        app = DuplicateFinderApp()
        app.mainloop()
    else:
        print("No display environment detected. Running in headless mode.")
        print("To use the GUI, please run this code in an environment with a graphical display or set up X11 forwarding.")
        # You could add command-line based functionality here if needed
        # For example, scan a folder specified as an argument and print duplicates
        # For this example, we'll just exit gracefully

To cite this code:

Loyer, Dominique. (2024). doublon.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

doublons_mem_13juin.py

A command-line organizer with persistent memory: it deduplicates files by SHA-256 hash, learns folder categories from an Obsidian vault or a Notion export, asks the Gemini API to classify new files in batches with retries, lets the user validate the proposed categories, then moves files into a Classement_Final tree while protecting anything on the Desktop. Validated decisions are saved to organizer_memory.json for later runs; an example of the memory file format is sketched below.

Keywords: deduplication, classification, Gemini API, JSON memory, file organization
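
For reference, a sketch of what organizer_memory.json can look like after a run; the two top-level keys come from load_memory/save_memory below, while the category and file names are purely illustrative:

{
    "learned_categories": ["Factures", "Notes de cours", "Projets personnels"],
    "classification_map": {
        "facture_hydro_mai.pdf": "Factures",
        "plan_memoire_v2.docx": "Notes de cours"
    }
}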

import os
import sys
import hashlib
import shutil
import json
import requests
from collections import defaultdict
import time

# --- NOUVEAU: Configuration de la Mémoire ---
MEMORY_FILE = os.path.join(os.path.expanduser('~'), 'Desktop', 'organizer_memory.json')

# =============================================================================
# --- FONCTIONS DE GESTION DE LA MÉMOIRE ---
# =============================================================================

def load_memory():
    """Charge la mémoire depuis le fichier JSON."""
    if os.path.exists(MEMORY_FILE):
        print("\nINFO: Fichier de mémoire trouvé. Chargement des connaissances antérieures...")
        with open(MEMORY_FILE, 'r', encoding='utf-8') as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return {"learned_categories": [], "classification_map": {}} # Fichier corrompu
    print("\nINFO: Aucun fichier de mémoire trouvé. Démarrage avec une table rase.")
    return {"learned_categories": [], "classification_map": {}}

def save_memory(data):
    """Sauvegarde la mémoire dans le fichier JSON."""
    print("\nMise à jour de la mémoire avec les nouvelles connaissances...")
    try:
        with open(MEMORY_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print(f"Mémoire sauvegardée avec succès dans : {MEMORY_FILE}")
    except Exception as e:
        print(f"[ERREUR] Impossible de sauvegarder la mémoire : {e}")


# =============================================================================
# --- ÉTAPES 1 & 2 (DOUBLONS) ---
# Le code reste identique aux versions précédentes.
# =============================================================================

def calculate_hash(filepath, block_size=65536):
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data: break
                sha256.update(data)
        return sha256.hexdigest()
    except (IOError, FileNotFoundError): return None

def find_duplicates(folder):
    print(f"\n--- ÉTAPE 1: Recherche des doublons dans : {folder} ---")
    hashes = defaultdict(list)
    excluded_dirs = ["Doublons", "Classement_Final"]
    files_to_scan = [os.path.join(dp, fn) for dp, dn, fns in os.walk(folder) for fn in fns if not any(excluded in dp for excluded in excluded_dirs)]
    total_files = len(files_to_scan)
    if total_files == 0: return {}, []
    for i, filepath in enumerate(files_to_scan):
        try:
            sys.stdout.write(f"\rAnalyse des fichiers : {((i + 1) / total_files) * 100:.1f}%")
            sys.stdout.flush()
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash: hashes[file_hash].append(filepath)
        except (FileNotFoundError, OSError): continue
    print("\nAnalyse des doublons terminée.")
    duplicates = {h: p for h, p in hashes.items() if len(p) > 1}
    unique_files = [p[0] for h, p in hashes.items() if len(p) == 1]
    return duplicates, unique_files

def get_readability_score(filename):
    score = len(filename)
    if any(c.isalpha() for c in filename): score += 20
    if filename.split('.')[0].isdigit(): score -= 10
    return score

def process_duplicates_safely(duplicates, base_folder, desktop_path):
    if not duplicates:
        print("\n--- ÉTAPE 2: Traitement des doublons ---\nBonne nouvelle ! Aucun fichier en double à traiter.")
        return []
    print(f"\n--- ÉTAPE 2: Traitement automatique et protégé de {len(duplicates)} groupes de doublons ---")
    duplicates_folder = os.path.join(base_folder, "Doublons")
    os.makedirs(duplicates_folder, exist_ok=True)
    print(f"Les doublons non-protégés seront déplacés dans : {duplicates_folder}")
    files_kept = []
    for group_paths in duplicates.values():
        try:
            if desktop_path and any(p.startswith(desktop_path) for p in group_paths):
                file_to_keep = max((p for p in group_paths if p.startswith(desktop_path)), key=os.path.getmtime)
                final_path = file_to_keep
            else:
                file_to_keep = max(group_paths, key=os.path.getmtime)
                best_name = os.path.basename(max(group_paths, key=lambda p: get_readability_score(os.path.basename(p))))
                original_dir = os.path.dirname(file_to_keep)
                potential_new_path = os.path.join(original_dir, best_name)
                final_path = file_to_keep
                if file_to_keep != potential_new_path and not os.path.exists(potential_new_path):
                    os.rename(file_to_keep, potential_new_path)
                    final_path = potential_new_path
            files_kept.append(final_path)
            for path in group_paths:
                if path != file_to_keep and not (desktop_path and path.startswith(desktop_path)):
                    try:
                        shutil.move(path, os.path.join(duplicates_folder, os.path.basename(path)))
                    except Exception as e: print(f"Erreur déplacement doublon {os.path.basename(path)}: {e}")
        except Exception as e: print(f"\nErreur traitement groupe doublons : {e}")
    print("\nTraitement des doublons terminé.")
    return files_kept

# =============================================================================
# --- ÉTAPE 3: CLASSIFICATION IA (MODIFIÉE POUR UTILISER LA MÉMOIRE) ---
# =============================================================================

def learn_structure_from_path(path_to_learn):
    learned_categories = set()
    if not os.path.isdir(path_to_learn): return learned_categories
    print(f"Analyse de la structure de : {path_to_learn}...")
    for root, dirs, _ in os.walk(path_to_learn):
        for dir_name in dirs:
            if dir_name.startswith('.') or dir_name.lower() in ["attachments", "files", "images"]: continue
            clean_name = dir_name.replace('_', ' ').strip()
            if len(clean_name) > 3: learned_categories.add(clean_name)
    return learned_categories

def propose_categories_with_ai(files_for_ai, learned_categories, api_key):
    """Propose des catégories pour les nouveaux fichiers, par lots et avec nouvelles tentatives."""
    if not files_for_ai:
        return {}

    print(f"\n--- ÉTAPE 3b: Interrogation de l'IA pour {len(files_for_ai)} nouveaux fichiers ---")
    BATCH_SIZE = 75
    MAX_RETRIES = 3
    BACKOFF_FACTOR = 2
    
    ai_classifications = {}
    total_files = len(files_for_ai)
    
    for i in range(0, total_files, BATCH_SIZE):
        # ... (Le code de batching reste le même que v13)
        batch_files = files_for_ai[i:i + BATCH_SIZE]
        batch_filenames = [os.path.basename(f) for f in batch_files]
        current_batch_num = (i // BATCH_SIZE) + 1
        total_batches = (total_files + BATCH_SIZE - 1) // BATCH_SIZE
        print(f"\nTraitement du lot {current_batch_num}/{total_batches}...")
        prompt = (f"En te basant sur cette liste de catégories apprises : {list(learned_categories)}. Pour la liste de noms de fichiers suivante : {json.dumps(batch_filenames)}. Propose la catégorie la plus pertinente pour CHAQUE fichier. Si aucune catégorie ne correspond, invente une catégorie pertinente et concise (2-4 mots). Réponds UNIQUEMENT avec un objet JSON où chaque clé est un nom de fichier et sa valeur est la catégorie proposée.")
        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(apiUrl, json=payload, timeout=90)
                response.raise_for_status()
                json_text = response.json()['candidates'][0]['content']['parts'][0]['text']
                json_text = json_text.strip().replace("```json", "").replace("```", "")
                batch_classifications = json.loads(json_text)
                ai_classifications.update(batch_classifications)
                print(f"Lot {current_batch_num} traité avec succès.")
                break
            except requests.exceptions.HTTPError as err:
                if err.response.status_code in [500, 503, 504] and (attempt + 1) < MAX_RETRIES:
                    wait_time = BACKOFF_FACTOR * (2 ** attempt)
                    print(f"  -> Échec temporaire (tentative {attempt + 1}/{MAX_RETRIES}): {err.response.status_code}. Nouvelle tentative dans {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"\n[ERREUR HTTP] Échec final du lot {current_batch_num}. Ce lot sera ignoré.")
                    break
            except Exception as e:
                print(f"\n[ERREUR] Échec du lot {current_batch_num} : {e}. Ce lot sera ignoré.")
                break
        time.sleep(1)
        
    return ai_classifications

# =============================================================================
# --- ÉTAPES 4 & 5 (VALIDATION, ORGANISATION) ---
# Le code reste identique.
# =============================================================================

def validate_categories(classification_plan):
    if not classification_plan: return set()
    proposed_categories = sorted(list(set(classification_plan.values())))
    print("\n--- ÉTAPE 4: Validation des catégories proposées ---")
    for i, cat_name in enumerate(proposed_categories): print(f"  {i+1}) {cat_name}")
    validated_categories = set()
    while True:
        try:
            choices_str = input("\nEntrez les numéros des catégories à conserver (ex: 1,3,4), ou 'toutes' : ")
            if choices_str.lower() == 'toutes': return set(proposed_categories)
            chosen_indices = [int(i.strip()) - 1 for i in choices_str.split(',')]
            for index in chosen_indices:
                if 0 <= index < len(proposed_categories): validated_categories.add(proposed_categories[index])
            if validated_categories:
                print("\nCatégories validées :", ", ".join(validated_categories))
                return validated_categories
        except ValueError: print("[ERREUR] Entrée invalide.")

def execute_final_organization(base_folder, classification_plan, validated_categories, desktop_path):
    print("\n--- ÉTAPE 5: Organisation finale des fichiers ---")
    output_root = os.path.join(base_folder, "Classement_Final")
    os.makedirs(output_root, exist_ok=True)
    print(f"Les fichiers seront déplacés et organisés dans : {output_root}")
    for cat in validated_categories: os.makedirs(os.path.join(output_root, cat), exist_ok=True)
    unclassified_dir = os.path.join(output_root, "Non Classé")
    os.makedirs(unclassified_dir, exist_ok=True)
    for i, (original_path, category) in enumerate(classification_plan.items()):
        sys.stdout.write(f"\rDéplacement des fichiers : {((i + 1) / len(classification_plan)) * 100:.1f}%")
        sys.stdout.flush()
        if os.path.exists(original_path) and not (desktop_path and original_path.startswith(desktop_path)):
            target_dir = os.path.join(output_root, category) if category in validated_categories else unclassified_dir
            try: shutil.move(original_path, os.path.join(target_dir, os.path.basename(original_path)))
            except Exception as e: print(f"\n[ERREUR] Impossible de déplacer {original_path}. Erreur: {e}")
    print(f"\n\nL'organisation finale est terminée !\nNote : les fichiers sur votre Bureau n'ont pas été déplacés.")

# =============================================================================
# --- FONCTION PRINCIPALE (main) ---
# =============================================================================

def main():
    try:
        print("--- Assistant d'Organisation v14 (avec Mémoire) ---")
        memory = load_memory()
        
        home = os.path.expanduser('~')
        desktop_path = next((p for p in [os.path.join(home, 'Desktop'), os.path.join(home, 'Bureau')] if os.path.isdir(p)), "")
        if desktop_path: print(f"Règle de protection activée pour le Bureau : {desktop_path}")
        
        api_key = input("Veuillez coller votre clé API Google AI Studio : ").strip()
        
        # --- Apprentissage ---
        obsidian_path = input("1. (Optionnel) Glissez-déposez votre coffre Obsidian pour mettre à jour la mémoire : ").strip("'\"")
        notion_path = input("2. (Optionnel) Glissez-déposez votre export Notion pour mettre à jour la mémoire : ").strip("'\"")
        
        learned_from_folders = learn_structure_from_path(obsidian_path)
        learned_from_folders.update(learn_structure_from_path(notion_path))
        
        # Met à jour les catégories de la mémoire avec les nouvelles apprises
        all_learned_categories = set(memory["learned_categories"]) | learned_from_folders
        if all_learned_categories: print(f"\nConnaissances actuelles : {len(all_learned_categories)} catégories mémorisées.")

        # --- Analyse ---
        folder_path = input("\n3. Maintenant, glissez-déposez le dossier principal à analyser : ").strip("'\"")
        if not os.path.isdir(folder_path): return

        duplicates, unique_files = find_duplicates(folder_path)
        files_kept_after_dedup = process_duplicates_safely(duplicates, folder_path, desktop_path)
        all_files_to_classify = list(set(unique_files + files_kept_after_dedup))
        all_files_to_classify = [f for f in all_files_to_classify if os.path.exists(f)]

        if not all_files_to_classify:
            print("\nIl ne reste aucun fichier à classer. Opération terminée.")
            return

        # --- Classification avec mémoire ---
        print("\n--- ÉTAPE 3a: Consultation de la mémoire pour la classification ---")
        classification_plan = {}
        files_for_ai = []
        classification_map_from_memory = memory["classification_map"]
        for f_path in all_files_to_classify:
            f_name = os.path.basename(f_path)
            if f_name in classification_map_from_memory:
                classification_plan[f_path] = classification_map_from_memory[f_name]
            else:
                files_for_ai.append(f_path)
        print(f"{len(classification_plan)} fichiers classés instantanément grâce à la mémoire.")

        if api_key and files_for_ai:
            ai_results_by_name = propose_categories_with_ai(files_for_ai, all_learned_categories, api_key)
            for f_path in files_for_ai:
                f_name = os.path.basename(f_path)
                classification_plan[f_path] = ai_results_by_name.get(f_name, "Non Classé")
        else:
             for f_path in files_for_ai:
                classification_plan[f_path] = "Non Classé"

        # --- Validation et Exécution ---
        validated_categories = validate_categories(classification_plan)
        if not validated_categories:
            print("\nAucune catégorie validée. Opération annulée.")
            return

        confirm = input("\nPrêt à lancer l'organisation finale ? (o/n) > ").lower()
        if confirm == 'o':
            execute_final_organization(folder_path, classification_plan, validated_categories, desktop_path)
            
            # --- Mise à jour finale de la mémoire ---
            memory["learned_categories"] = sorted(list(validated_categories | all_learned_categories))
            for f_path, category in classification_plan.items():
                if category in validated_categories:
                    memory["classification_map"][os.path.basename(f_path)] = category
            save_memory(memory)
        else:
            print("\nOpération finale annulée. La mémoire n'a pas été mise à jour.")

    except (EOFError, KeyboardInterrupt):
        print("\n\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). doublons_mem_13juin.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

en-fr_13juin25.ipynb

A Colab notebook that fine-tunes the Helsinki-NLP/opus-mt-fr-en translation model on the opus_books en-fr dataset with the Hugging Face Seq2SeqTrainer, evaluates with sacreBLEU, runs a few sample translations, and can optionally push the resulting model to the Hub. A sketch of the expected dataset record format follows.

Keywords: machine translation, fine-tuning, Transformers, sacreBLEU, Hugging Face
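
The preprocessing cell below indexes ex["translation"] by the "fr" and "en" keys, which assumes each opus_books record has the usual Hugging Face translation structure, roughly as in this sketch (the sentence pair is illustrative):

from datasets import load_dataset

sample = load_dataset("opus_books", "en-fr", split="train[:1]")[0]
print(sample)
# expected shape, roughly: {'id': '0', 'translation': {'en': 'The cat sleeps.', 'fr': 'Le chat dort.'}}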

from google.colab import userdata
userdata.get('HF_TOKEN')

# --- Nouvelle Cellule ---

'Put your Hugging Face token here'

# --- Nouvelle Cellule ---

# Authenticate to Hugging Face if HF_TOKEN is defined (Kaggle Secrets; on Colab, userdata above plays the same role)
hf_token = None
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import HfFolder
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    HfFolder.save_token(hf_token)
except Exception:
    pass

# Imports required by the cells below
import numpy as np
import torch
import evaluate
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)

# Configuration
MODEL = "Helsinki-NLP/opus-mt-fr-en"
SRC, TGT = "fr", "en"
BATCH = 32
EPOCHS = 3
OUTPUT = "opus-mt-fr-en-colab"

# Chargement et split du dataset
ds = load_dataset("opus_books", "en-fr")
s = ds["train"].train_test_split(0.05, seed=42)
s2 = s["train"].train_test_split(0.05, seed=42)
raw = DatasetDict({"train": s2["train"], "validation": s2["test"], "test": s["test"]})

# Tokenizer
tok = AutoTokenizer.from_pretrained(MODEL)
def preprocess(ex):
    srcs = [t[SRC] for t in ex["translation"]]
    tgts = [t[TGT] for t in ex["translation"]]
    mi = tok(srcs, max_length=128, truncation=True, padding=False)
    lb = tok(text_target=tgts, max_length=128, truncation=True, padding=False)
    mi["labels"] = lb["input_ids"]
    return mi

tokenized = raw.map(preprocess, batched=True, remove_columns=raw["train"].column_names)

# Modèle + DataCollator + Métriques
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL).to(device)
dc = DataCollatorForSeq2Seq(tok, model=model)
sacrebleu = evaluate.load("sacrebleu")

def compute_metrics(p):
    preds, labels = p.predictions, p.label_ids
    if isinstance(preds, tuple): preds = preds[0]
    preds = np.where(preds != -100, preds, tok.pad_token_id)
    labels = np.where(labels != -100, labels, tok.pad_token_id)
    dp = tok.batch_decode(preds, skip_special_tokens=True)
    dl = tok.batch_decode(labels, skip_special_tokens=True)
    return {"bleu": sacrebleu.compute(predictions=dp, references=[[l] for l in dl])["score"]}

args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    learning_rate=5e-5,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    push_to_hub=hf_token is not None,
    hub_model_id="USERNAME/" + OUTPUT  # replace USERNAME with your Hub username
)

trainer = Seq2SeqTrainer(
    model=model, args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tok, data_collator=dc,
    compute_metrics=compute_metrics
)

# Lancement
trainer.train()
trainer.save_model()
print(trainer.predict(tokenized["test"], metric_key_prefix="test").metrics)

# Inférence
for s in ["Bonjour le monde", "J'espère BLEU ~40", "Bonne traduction !"]:
    out = model.generate(**tok(s, return_tensors="pt", truncation=True).to(device),
                         max_length=128, num_beams=4)
    print(f"{s} → {tok.decode(out[0], skip_special_tokens=True)}")


# --- Nouvelle Cellule ---

!pip install --upgrade --force-reinstall datasets huggingface_hub fsspec

# --- Nouvelle Cellule ---

from datasets import load_dataset, DatasetDict

# --- Nouvelle Cellule ---

To cite this code:

Loyer, Dominique. (2024). en-fr_13juin25.ipynb [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

export_gemini.py

A Selenium script (using selenium-stealth and webdriver-manager) that reuses a local Chrome profile, automatically discovers Gemini conversation URLs from the chat history, and prints each conversation to PDF via Chrome's kiosk-printing mode into the Gemini_Exports_PDF folder. It installs its own dependencies on first launch and asks to be relaunched afterwards.

Keywords: Selenium, Chrome automation, PDF export, Gemini

# -*- coding: utf-8 -*-

# =============================================================================
# EXPORTATEUR DE CLAVARDAGE GEMINI EN PDF (v2.9 - Installation en 2 temps)
#
# Description :
# Ce script installe ses dépendances au premier lancement, puis s'exécute
# au second lancement. C'est la méthode la plus fiable pour éviter les
# erreurs de 'ModuleNotFoundError' après une installation.
#
# Auteur : Gemini
# Version : 2.9
# =============================================================================

import sys
import subprocess
import os
import time
import json
import platform

# =============================================================================
# DÉPENDANCES ET INSTALLATION AUTOMATIQUE
# =============================================================================
def check_and_install_packages():
    """Vérifie et installe les paquets. Si une installation a lieu,
    le script demandera à être relancé."""
    required_packages = {
        'selenium': 'selenium',
        'selenium_stealth': 'selenium-stealth',  # the selenium-stealth package installs the selenium_stealth module
        'webdriver_manager': 'webdriver-manager'
    }
    
    print("--- Vérification des dépendances ---")
    installations_made = False
    for package_import, package_name in required_packages.items():
        try:
            __import__(package_import)
            print(f"[✓] {package_name} est déjà installé.")
        except ImportError:
            print(f"[!] {package_name} est manquant. Tentative d'installation...")
            installations_made = True
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
                print(f"[✓] {package_name} a été installé avec succès.")
            except subprocess.CalledProcessError as e:
                print(f"[X] ERREUR : Impossible d'installer {package_name}.")
                exit()
    
    # Si une installation a eu lieu, on quitte pour permettre au script d'être relancé
    if installations_made:
        print("\n---------------------------------------------------------")
        print("  Installation terminée. Veuillez relancer le script.")
        print("---------------------------------------------------------")
        exit()
        
    print("--- Toutes les dépendances sont prêtes. ---\n")

# Lancer la vérification au démarrage du script
check_and_install_packages()

# Les imports sont maintenant faits en toute sécurité
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium_stealth import stealth

# =============================================================================
# SECTION DE CONFIGURATION (ACTION REQUISE)
# =============================================================================

# !! IMPORTANT !!
# 1. FERMEZ COMPLÈTEMENT GOOGLE CHROME AVANT DE LANCER LE SCRIPT.
#    (Clic droit sur l'icône dans le Dock -> Quitter)

# 2. Le chemin de votre profil Chrome est déjà configuré.
CHROME_PROFILE_PATH = "/Users/bk280625/Library/Application Support/Google/Chrome"

# 3. Découverte automatique des URLs (recommandé)
AUTO_DISCOVER_URLS = True

# 4. (Optionnel) Liste manuelle si AUTO_DISCOVER_URLS = False
GEMINI_URLS = []

# 5. Dossier où sauvegarder les PDFs.
OUTPUT_DIRECTORY = "Gemini_Exports_PDF"

# 6. Nombre de fois où faire défiler la page pour charger l'historique.
SCROLL_ATTEMPTS = 30


# =============================================================================
# FONCTIONS DU SCRIPT (NE PAS MODIFIER)
# =============================================================================

def setup_driver(profile_path, download_dir):
    """Configure le pilote Chrome avec le gestionnaire auto, le profil et le mode discret."""
    options = webdriver.ChromeOptions()
    options.add_argument(f"user-data-dir={profile_path}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    
    settings = {
        "recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
        "isHeaderFooterEnabled": False,
        "isCssBackgroundEnabled": True
    }
    prefs = {
        'printing.print_preview_sticky_settings.appState': json.dumps(settings),
        'savefile.default_directory': download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "download.safebrowsing.enabled": True
    }
    options.add_experimental_option('prefs', prefs)
    options.add_argument('--kiosk-printing')
    
    try:
        print("-> Configuration du pilote Chrome...")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        
        print("-> Activation du mode discret...")
        stealth(driver,
                languages=["fr-FR", "fr"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )
        
        return driver
    except Exception as e:
        print(f"ERREUR : Impossible de démarrer Chrome.")
        print(f"Détail de l'erreur : {e}")
        return None

def discover_gemini_urls(driver, scroll_attempts):
    """Navigue et récupère tous les liens de conversation."""
    print("\n--- Démarrage de la découverte automatique des URLs ---")
    driver.get("https://gemini.google.com/app")
    
    wait = WebDriverWait(driver, 60)
    
    try:
        print("-> Attente du chargement de l'interface principale...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "rich-textarea")))
        print("-> Interface principale détectée.")
        time.sleep(2)

        print("-> Attente de l'apparition de l'historique des conversations...")
        first_link_selector = "a[href*='/app/c/']"
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, first_link_selector)))
        print("-> Historique détecté et visible.")
        
    except TimeoutException:
        print("\nERREUR CRITIQUE : Impossible de trouver l'historique.")
        return []

    print(f"-> Défilement pour charger l'historique complet ({scroll_attempts} tentatives)...")
    try:
        scroll_target = driver.find_element(By.CSS_SELECTOR, "nav")
        for i in range(scroll_attempts):
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_target)
            print(f"  Défilement du panneau {i+1}/{scroll_attempts}...")
            time.sleep(2)
    except NoSuchElementException:
        print("-> Panneau de navigation non trouvé, défilement de la page entière...")
        for i in range(scroll_attempts):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(f"  Défilement de la page {i+1}/{scroll_attempts}...")
            time.sleep(2)

    print("-> Recherche de tous les liens de conversation après défilement...")
    urls = set()
    try:
        links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/app/c/']")
        for link in links:
            href = link.get_attribute('href')
            if href:
                urls.add(href)
    except Exception as e:
        print(f"ERREUR inattendue lors de la collecte des liens : {e}")
        return []

    print(f"-> {len(urls)} conversations uniques trouvées.")
    return list(urls)


def sanitize_filename(title):
    """Nettoie un titre pour en faire un nom de fichier valide."""
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        title = title.replace(char, '_')
    return title[:150].strip()

def export_chat_to_pdf(driver, url, output_dir):
    """Navigue vers une URL Gemini et l'exporte en PDF."""
    try:
        print(f"\nTraitement de l'URL : {url.split('/')[-1]}")
        driver.get(url)
        print("-> Attente du chargement de la conversation...")
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".message-content")))
        print("-> Conversation chargée.")
        
        time.sleep(2)
        page_title = driver.title
        
        if "Gemini" in page_title and len(page_title) < 20:
             chat_id = url.split('/')[-1]
             file_name = f"gemini_chat_{chat_id}.pdf"
        else:
             file_name = sanitize_filename(page_title) + ".pdf"
        
        print(f"-> Lancement de l'impression en PDF...")
        driver.execute_script("window.print();")
        
        time.sleep(5)
        
        print(f"-> Sauvegardé (probablement) sous : {file_name}")
        return True

    except TimeoutException:
        print("ERREUR : Le chargement de la page de conversation a pris trop de temps.")
        return False
    except Exception as e:
        print(f"ERREUR inattendue lors du traitement de {url}: {e}")
        return False

# =============================================================================
# SCRIPT PRINCIPAL
# =============================================================================

if __name__ == "__main__":
    print("========================================")
    print("  Exportateur de Clavardage Gemini PDF  ")
    print("========================================")

    profile_path = CHROME_PROFILE_PATH
    if not profile_path or not os.path.exists(profile_path):
        print("\nERREUR : Chemin du profil Chrome non trouvé ou invalide.")
        print("Veuillez le renseigner manuellement dans la variable 'CHROME_PROFILE_PATH'.")
        exit()

    print(f"Utilisation du profil Chrome : {profile_path}")
    print("IMPORTANT : Assurez-vous que Google Chrome est complètement fermé.")
    time.sleep(3)

    if not os.path.exists(OUTPUT_DIRECTORY):
        print(f"\nCréation du dossier de sortie : {OUTPUT_DIRECTORY}")
        os.makedirs(OUTPUT_DIRECTORY)
    
    absolute_output_path = os.path.abspath(OUTPUT_DIRECTORY)
    
    driver = setup_driver(profile_path, absolute_output_path)

    if driver:
        urls_to_process = []
        if AUTO_DISCOVER_URLS:
            urls_to_process = discover_gemini_urls(driver, SCROLL_ATTEMPTS)
        else:
            urls_to_process = GEMINI_URLS

        if not urls_to_process:
            print("\nAucune URL à traiter. Fin du script.")
        else:
            print(f"\n--- Démarrage de l'exportation de {len(urls_to_process)} conversations ---")
            success_count = 0
            fail_count = 0
            for i, url in enumerate(urls_to_process):
                print(f"\n--- Progression : Conversation {i+1}/{len(urls_to_process)} ---")
                if export_chat_to_pdf(driver, url, absolute_output_path):
                    success_count += 1
                else:
                    fail_count += 1
            
            print("\n----------------------------------------")
            print("Exportation terminée !")
            print(f"Conversations sauvegardées : {success_count}")
            print(f"Échecs : {fail_count}")
            print(f"Les fichiers PDF se trouvent dans : {absolute_output_path}")
        
        driver.quit()

To cite this code:

Loyer, Dominique. (2024). export_gemini.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

extraire_fichiers copie (trashed 2025-05-30 11-48-03) copie.py

The automatic description for this entry could not be generated. Only the GitHub page for Alimiji/Solr_utilisation/extraire_fichiers.py (a project demonstrating the use of Solr) was captured, not the script's source code, so no listing is reproduced here.

Keywords: error, api







          input:query-builder#inputChange
          blur:query-builder#inputBlur
          keydown:query-builder#inputKeydown
          focus:query-builder#inputFocus
        " data-view-component="true" class="FormControl-input QueryBuilder-Input FormControl-medium" />
          </div>
        </div>
          <span class="sr-only" id="query-builder-test-clear">Clear</span>
          <button role="button" id="query-builder-test-clear-button" aria-labelledby="query-builder-test-clear query-builder-test-label" data-target="query-builder.clearButton" data-action="
                click:query-builder#clear
                focus:query-builder#clearButtonFocus
                blur:query-builder#clearButtonBlur
              " variant="small" hidden="hidden" type="button" data-view-component="true" class="Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x-circle-fill Button-visual">
    <path d="M2.343 13.657A8 8 0 1 1 13.658 2.343 8 8 0 0 1 2.343 13.657ZM6.03 4.97a.751.751 0 0 0-1.042.018.751.751 0 0 0-.018 1.042L6.94 8 4.97 9.97a.749.749 0 0 0 .326 1.275.749.749 0 0 0 .734-.215L8 9.06l1.97 1.97a.749.749 0 0 0 1.275-.326.749.749 0 0 0-.215-.734L9.06 8l1.97-1.97a.749.749 0 0 0-.326-1.275.749.749 0 0 0-.734.215L8 6.94Z"></path>
</svg>
</button>

      </div>
      <template id="search-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-search">
    <path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 7Z"></path>
</svg>
</template>

<template id="code-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code">
    <path d="m11.28 3.22 4.25 4.25a.75.75 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L13.94 8l-3.72-3.72a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215Zm-6.56 0a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042L2.06 8l3.72 3.72a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L.47 8.53a.75.75 0 0 1 0-1.06Z"></path>
</svg>
</template>

<template id="file-code-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-file-code">
    <path d="M4 1.75C4 .784 4.784 0 5.75 0h5.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v8.586A1.75 1.75 0 0 1 14.25 15h-9a.75.75 0 0 1 0-1.5h9a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 10 4.25V1.5H5.75a.25.25 0 0 0-.25.25v2.5a.75.75 0 0 1-1.5 0Zm1.72 4.97a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 2a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734l1.47-1.47-1.47-1.47a.75.75 0 0 1 0-1.06ZM3.28 7.78 1.81 9.25l1.47 1.47a.751.751 0 0 1-.018 1.042.751.751 0 0 1-1.042.018l-2-2a.75.75 0 0 1 0-1.06l2-2a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Zm8.22-6.218V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path>
</svg>
</template>

<template id="history-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-history">
    <path d="m.427 1.927 1.215 1.215a8.002 8.002 0 1 1-1.6 5.685.75.75 0 1 1 1.493-.154 6.5 6.5 0 1 0 1.18-4.458l1.358 1.358A.25.25 0 0 1 3.896 6H.25A.25.25 0 0 1 0 5.75V2.104a.25.25 0 0 1 .427-.177ZM7.75 4a.75.75 0 0 1 .75.75v2.992l2.028.812a.75.75 0 0 1-.557 1.392l-2.5-1A.751.751 0 0 1 7 8.25v-3.5A.75.75 0 0 1 7.75 4Z"></path>
</svg>
</template>

<template id="repo-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-repo">
    <path d="M2 2.5A2.5 2.5 0 0 1 4.5 0h8.75a.75.75 0 0 1 .75.75v12.5a.75.75 0 0 1-.75.75h-2.5a.75.75 0 0 1 0-1.5h1.75v-2h-8a1 1 0 0 0-.714 1.7.75.75 0 1 1-1.072 1.05A2.495 2.495 0 0 1 2 11.5Zm10.5-1h-8a1 1 0 0 0-1 1v6.708A2.486 2.486 0 0 1 4.5 9h8ZM5 12.25a.25.25 0 0 1 .25-.25h3.5a.25.25 0 0 1 .25.25v3.25a.25.25 0 0 1-.4.2l-1.45-1.087a.249.249 0 0 0-.3 0L5.4 15.7a.25.25 0 0 1-.4-.2Z"></path>
</svg>
</template>

<template id="bookmark-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-bookmark">
    <path d="M3 2.75C3 1.784 3.784 1 4.75 1h6.5c.966 0 1.75.784 1.75 1.75v11.5a.75.75 0 0 1-1.227.579L8 11.722l-3.773 3.107A.751.751 0 0 1 3 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v9.91l3.023-2.489a.75.75 0 0 1 .954 0l3.023 2.49V2.75a.25.25 0 0 0-.25-.25Z"></path>
</svg>
</template>

<template id="plus-circle-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-plus-circle">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Zm7.25-3.25v2.5h2.5a.75.75 0 0 1 0 1.5h-2.5v2.5a.75.75 0 0 1-1.5 0v-2.5h-2.5a.75.75 0 0 1 0-1.5h2.5v-2.5a.75.75 0 0 1 1.5 0Z"></path>
</svg>
</template>

<template id="circle-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-dot-fill">
    <path d="M8 4a4 4 0 1 1 0 8 4 4 0 0 1 0-8Z"></path>
</svg>
</template>

<template id="trash-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-trash">
    <path d="M11 1.75V3h2.25a.75.75 0 0 1 0 1.5H2.75a.75.75 0 0 1 0-1.5H5V1.75C5 .784 5.784 0 6.75 0h2.5C10.216 0 11 .784 11 1.75ZM4.496 6.675l.66 6.6a.25.25 0 0 0 .249.225h5.19a.25.25 0 0 0 .249-.225l.66-6.6a.75.75 0 0 1 1.492.149l-.66 6.6A1.748 1.748 0 0 1 10.595 15h-5.19a1.75 1.75 0 0 1-1.741-1.575l-.66-6.6a.75.75 0 1 1 1.492-.15ZM6.5 1.75V3h3V1.75a.25.25 0 0 0-.25-.25h-2.5a.25.25 0 0 0-.25.25Z"></path>
</svg>
</template>

<template id="team-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-people">
    <path d="M2 5.5a3.5 3.5 0 1 1 5.898 2.549 5.508 5.508 0 0 1 3.034 4.084.75.75 0 1 1-1.482.235 4 4 0 0 0-7.9 0 .75.75 0 0 1-1.482-.236A5.507 5.507 0 0 1 3.102 8.05 3.493 3.493 0 0 1 2 5.5ZM11 4a3.001 3.001 0 0 1 2.22 5.018 5.01 5.01 0 0 1 2.56 3.012.749.749 0 0 1-.885.954.752.752 0 0 1-.549-.514 3.507 3.507 0 0 0-2.522-2.372.75.75 0 0 1-.574-.73v-.352a.75.75 0 0 1 .416-.672A1.5 1.5 0 0 0 11 5.5.75.75 0 0 1 11 4Zm-5.5-.5a2 2 0 1 0-.001 3.999A2 2 0 0 0 5.5 3.5Z"></path>
</svg>
</template>

<template id="project-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-project">
    <path d="M1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25V1.75C0 .784.784 0 1.75 0ZM1.5 1.75v12.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25ZM11.75 3a.75.75 0 0 1 .75.75v7.5a.75.75 0 0 1-1.5 0v-7.5a.75.75 0 0 1 .75-.75Zm-8.25.75a.75.75 0 0 1 1.5 0v5.5a.75.75 0 0 1-1.5 0ZM8 3a.75.75 0 0 1 .75.75v3.5a.75.75 0 0 1-1.5 0v-3.5A.75.75 0 0 1 8 3Z"></path>
</svg>
</template>

<template id="pencil-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-pencil">
    <path d="M11.013 1.427a1.75 1.75 0 0 1 2.474 0l1.086 1.086a1.75 1.75 0 0 1 0 2.474l-8.61 8.61c-.21.21-.47.364-.756.445l-3.251.93a.75.75 0 0 1-.927-.928l.929-3.25c.081-.286.235-.547.445-.758l8.61-8.61Zm.176 4.823L9.75 4.81l-6.286 6.287a.253.253 0 0 0-.064.108l-.558 1.953 1.953-.558a.253.253 0 0 0 .108-.064Zm1.238-3.763a.25.25 0 0 0-.354 0L10.811 3.75l1.439 1.44 1.263-1.263a.25.25 0 0 0 0-.354Z"></path>
</svg>
</template>

<template id="copilot-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copilot">
    <path d="M7.998 15.035c-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.201-.508-.254-1.084-.254-1.656 0-.87.128-1.769.693-2.484.579-.733 1.494-1.124 2.724-1.261 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095v1.872c0 .766-3.351 3.795-8.002 3.795Zm0-1.485c2.28 0 4.584-1.11 5.002-1.433V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-1.146 0-2.059-.327-2.71-.991A3.222 3.222 0 0 1 8 6.303a3.24 3.24 0 0 1-.544.743c-.65.664-1.563.991-2.71.991-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433ZM6.762 2.83c-.193-.206-.637-.413-1.682-.297-1.019.113-1.479.404-1.713.7-.247.312-.369.789-.369 1.554 0 .793.129 1.171.308 1.371.162.181.519.379 1.442.379.853 0 1.339-.235 1.638-.54.315-.322.527-.827.617-1.553.117-.935-.037-1.395-.241-1.614Zm4.155-.297c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Z"></path><path d="M6.25 9.037a.75.75 0 0 1 .75.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 .75-.75Zm4.25.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 1.5 0Z"></path>
</svg>
</template>

<template id="copilot-error-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copilot-error">
    <path d="M16 11.24c0 .112-.072.274-.21.467L13 9.688V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-.198 0-.388-.009-.571-.029L6.833 5.226a4.01 4.01 0 0 0 .17-.782c.117-.935-.037-1.395-.241-1.614-.193-.206-.637-.413-1.682-.297-.683.076-1.115.231-1.395.415l-1.257-.91c.579-.564 1.413-.877 2.485-.996 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095Zm-5.083-8.707c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Zm2.511 11.074c-1.393.776-3.272 1.428-5.43 1.428-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.18-.455-.241-.963-.252-1.475L.31 4.107A.747.747 0 0 1 0 3.509V3.49a.748.748 0 0 1 .625-.73c.156-.026.306.047.435.139l14.667 10.578a.592.592 0 0 1 .227.264.752.752 0 0 1 .046.249v.022a.75.75 0 0 1-1.19.596Zm-1.367-.991L5.635 7.964a5.128 5.128 0 0 1-.889.073c-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433 1.539 0 3.089-.505 4.063-.934Z"></path>
</svg>
</template>

<template id="workflow-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-workflow">
    <path d="M0 1.75C0 .784.784 0 1.75 0h3.5C6.216 0 7 .784 7 1.75v3.5A1.75 1.75 0 0 1 5.25 7H4v4a1 1 0 0 0 1 1h4v-1.25C9 9.784 9.784 9 10.75 9h3.5c.966 0 1.75.784 1.75 1.75v3.5A1.75 1.75 0 0 1 14.25 16h-3.5A1.75 1.75 0 0 1 9 14.25v-.75H5A2.5 2.5 0 0 1 2.5 11V7h-.75A1.75 1.75 0 0 1 0 5.25Zm1.75-.25a.25.25 0 0 0-.25.25v3.5c0 .138.112.25.25.25h3.5a.25.25 0 0 0 .25-.25v-3.5a.25.25 0 0 0-.25-.25Zm9 9a.25.25 0 0 0-.25.25v3.5c0 .138.112.25.25.25h3.5a.25.25 0 0 0 .25-.25v-3.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
</template>

<template id="book-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-book">
    <path d="M0 1.75A.75.75 0 0 1 .75 1h4.253c1.227 0 2.317.59 3 1.501A3.743 3.743 0 0 1 11.006 1h4.245a.75.75 0 0 1 .75.75v10.5a.75.75 0 0 1-.75.75h-4.507a2.25 2.25 0 0 0-1.591.659l-.622.621a.75.75 0 0 1-1.06 0l-.622-.621A2.25 2.25 0 0 0 5.258 13H.75a.75.75 0 0 1-.75-.75Zm7.251 10.324.004-5.073-.002-2.253A2.25 2.25 0 0 0 5.003 2.5H1.5v9h3.757a3.75 3.75 0 0 1 1.994.574ZM8.755 4.75l-.004 7.322a3.752 3.752 0 0 1 1.992-.572H14.5v-9h-3.495a2.25 2.25 0 0 0-2.25 2.25Z"></path>
</svg>
</template>

<template id="code-review-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code-review">
    <path d="M1.75 1h12.5c.966 0 1.75.784 1.75 1.75v8.5A1.75 1.75 0 0 1 14.25 13H8.061l-2.574 2.573A1.458 1.458 0 0 1 3 14.543V13H1.75A1.75 1.75 0 0 1 0 11.25v-8.5C0 1.784.784 1 1.75 1ZM1.5 2.75v8.5c0 .138.112.25.25.25h2a.75.75 0 0 1 .75.75v2.19l2.72-2.72a.749.749 0 0 1 .53-.22h6.5a.25.25 0 0 0 .25-.25v-8.5a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25Zm5.28 1.72a.75.75 0 0 1 0 1.06L5.31 7l1.47 1.47a.751.751 0 0 1-.018 1.042.751.751 0 0 1-1.042.018l-2-2a.75.75 0 0 1 0-1.06l2-2a.75.75 0 0 1 1.06 0Zm2.44 0a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 2a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L10.69 7 9.22 5.53a.75.75 0 0 1 0-1.06Z"></path>
</svg>
</template>

<template id="codespaces-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-codespaces">
    <path d="M0 11.25c0-.966.784-1.75 1.75-1.75h12.5c.966 0 1.75.784 1.75 1.75v3A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm2-9.5C2 .784 2.784 0 3.75 0h8.5C13.216 0 14 .784 14 1.75v5a1.75 1.75 0 0 1-1.75 1.75h-8.5A1.75 1.75 0 0 1 2 6.75Zm1.75-.25a.25.25 0 0 0-.25.25v5c0 .138.112.25.25.25h8.5a.25.25 0 0 0 .25-.25v-5a.25.25 0 0 0-.25-.25Zm-2 9.5a.25.25 0 0 0-.25.25v3c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-3a.25.25 0 0 0-.25-.25Z"></path><path d="M7 12.75a.75.75 0 0 1 .75-.75h4.5a.75.75 0 0 1 0 1.5h-4.5a.75.75 0 0 1-.75-.75Zm-4 0a.75.75 0 0 1 .75-.75h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1-.75-.75Z"></path>
</svg>
</template>

<template id="comment-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-comment">
    <path d="M1 2.75C1 1.784 1.784 1 2.75 1h10.5c.966 0 1.75.784 1.75 1.75v7.5A1.75 1.75 0 0 1 13.25 12H9.06l-2.573 2.573A1.458 1.458 0 0 1 4 13.543V12H2.75A1.75 1.75 0 0 1 1 10.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h2a.75.75 0 0 1 .75.75v2.19l2.72-2.72a.749.749 0 0 1 .53-.22h4.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
</template>

<template id="comment-discussion-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-comment-discussion">
    <path d="M1.75 1h8.5c.966 0 1.75.784 1.75 1.75v5.5A1.75 1.75 0 0 1 10.25 10H7.061l-2.574 2.573A1.458 1.458 0 0 1 2 11.543V10h-.25A1.75 1.75 0 0 1 0 8.25v-5.5C0 1.784.784 1 1.75 1ZM1.5 2.75v5.5c0 .138.112.25.25.25h1a.75.75 0 0 1 .75.75v2.19l2.72-2.72a.749.749 0 0 1 .53-.22h3.5a.25.25 0 0 0 .25-.25v-5.5a.25.25 0 0 0-.25-.25h-8.5a.25.25 0 0 0-.25.25Zm13 2a.25.25 0 0 0-.25-.25h-.5a.75.75 0 0 1 0-1.5h.5c.966 0 1.75.784 1.75 1.75v5.5A1.75 1.75 0 0 1 14.25 12H14v1.543a1.458 1.458 0 0 1-2.487 1.03L9.22 12.28a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215l2.22 2.22v-2.19a.75.75 0 0 1 .75-.75h1a.25.25 0 0 0 .25-.25Z"></path>
</svg>
</template>

<template id="organization-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-organization">
    <path d="M1.75 16A1.75 1.75 0 0 1 0 14.25V1.75C0 .784.784 0 1.75 0h8.5C11.216 0 12 .784 12 1.75v12.5c0 .085-.006.168-.018.25h2.268a.25.25 0 0 0 .25-.25V8.285a.25.25 0 0 0-.111-.208l-1.055-.703a.749.749 0 1 1 .832-1.248l1.055.703c.487.325.779.871.779 1.456v5.965A1.75 1.75 0 0 1 14.25 16h-3.5a.766.766 0 0 1-.197-.026c-.099.017-.2.026-.303.026h-3a.75.75 0 0 1-.75-.75V14h-1v1.25a.75.75 0 0 1-.75.75Zm-.25-1.75c0 .138.112.25.25.25H4v-1.25a.75.75 0 0 1 .75-.75h2.5a.75.75 0 0 1 .75.75v1.25h2.25a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25h-8.5a.25.25 0 0 0-.25.25ZM3.75 6h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5ZM3 3.75A.75.75 0 0 1 3.75 3h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 3 3.75Zm4 3A.75.75 0 0 1 7.75 6h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 7 6.75ZM7.75 3h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5ZM3 9.75A.75.75 0 0 1 3.75 9h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 3 9.75ZM7.75 9h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5Z"></path>
</svg>
</template>

<template id="rocket-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-rocket">
    <path d="M14.064 0h.186C15.216 0 16 .784 16 1.75v.186a8.752 8.752 0 0 1-2.564 6.186l-.458.459c-.314.314-.641.616-.979.904v3.207c0 .608-.315 1.172-.833 1.49l-2.774 1.707a.749.749 0 0 1-1.11-.418l-.954-3.102a1.214 1.214 0 0 1-.145-.125L3.754 9.816a1.218 1.218 0 0 1-.124-.145L.528 8.717a.749.749 0 0 1-.418-1.11l1.71-2.774A1.748 1.748 0 0 1 3.31 4h3.204c.288-.338.59-.665.904-.979l.459-.458A8.749 8.749 0 0 1 14.064 0ZM8.938 3.623h-.002l-.458.458c-.76.76-1.437 1.598-2.02 2.5l-1.5 2.317 2.143 2.143 2.317-1.5c.902-.583 1.74-1.26 2.499-2.02l.459-.458a7.25 7.25 0 0 0 2.123-5.127V1.75a.25.25 0 0 0-.25-.25h-.186a7.249 7.249 0 0 0-5.125 2.123ZM3.56 14.56c-.732.732-2.334 1.045-3.005 1.148a.234.234 0 0 1-.201-.064.234.234 0 0 1-.064-.201c.103-.671.416-2.273 1.15-3.003a1.502 1.502 0 1 1 2.12 2.12Zm6.94-3.935c-.088.06-.177.118-.266.175l-2.35 1.521.548 1.783 1.949-1.2a.25.25 0 0 0 .119-.213ZM3.678 8.116 5.2 5.766c.058-.09.117-.178.176-.266H3.309a.25.25 0 0 0-.213.119l-1.2 1.95ZM12 5a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
</template>

<template id="shield-check-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-shield-check">
    <path d="m8.533.133 5.25 1.68A1.75 1.75 0 0 1 15 3.48V7c0 1.566-.32 3.182-1.303 4.682-.983 1.498-2.585 2.813-5.032 3.855a1.697 1.697 0 0 1-1.33 0c-2.447-1.042-4.049-2.357-5.032-3.855C1.32 10.182 1 8.566 1 7V3.48a1.75 1.75 0 0 1 1.217-1.667l5.25-1.68a1.748 1.748 0 0 1 1.066 0Zm-.61 1.429.001.001-5.25 1.68a.251.251 0 0 0-.174.237V7c0 1.36.275 2.666 1.057 3.859.784 1.194 2.121 2.342 4.366 3.298a.196.196 0 0 0 .154 0c2.245-.957 3.582-2.103 4.366-3.297C13.225 9.666 13.5 8.358 13.5 7V3.48a.25.25 0 0 0-.174-.238l-5.25-1.68a.25.25 0 0 0-.153 0ZM11.28 6.28l-3.5 3.5a.75.75 0 0 1-1.06 0l-1.5-1.5a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215l.97.97 2.97-2.97a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path>
</svg>
</template>

<template id="heart-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-heart">
    <path d="m8 14.25.345.666a.75.75 0 0 1-.69 0l-.008-.004-.018-.01a7.152 7.152 0 0 1-.31-.17 22.055 22.055 0 0 1-3.434-2.414C2.045 10.731 0 8.35 0 5.5 0 2.836 2.086 1 4.25 1 5.797 1 7.153 1.802 8 3.02 8.847 1.802 10.203 1 11.75 1 13.914 1 16 2.836 16 5.5c0 2.85-2.045 5.231-3.885 6.818a22.066 22.066 0 0 1-3.744 2.584l-.018.01-.006.003h-.002ZM4.25 2.5c-1.336 0-2.75 1.164-2.75 3 0 2.15 1.58 4.144 3.365 5.682A20.58 20.58 0 0 0 8 13.393a20.58 20.58 0 0 0 3.135-2.211C12.92 9.644 14.5 7.65 14.5 5.5c0-1.836-1.414-3-2.75-3-1.373 0-2.609.986-3.029 2.456a.749.749 0 0 1-1.442 0C6.859 3.486 5.623 2.5 4.25 2.5Z"></path>
</svg>
</template>

<template id="server-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-server">
    <path d="M1.75 1h12.5c.966 0 1.75.784 1.75 1.75v4c0 .372-.116.717-.314 1 .198.283.314.628.314 1v4a1.75 1.75 0 0 1-1.75 1.75H1.75A1.75 1.75 0 0 1 0 12.75v-4c0-.358.109-.707.314-1a1.739 1.739 0 0 1-.314-1v-4C0 1.784.784 1 1.75 1ZM1.5 2.75v4c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-4a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25Zm.25 5.75a.25.25 0 0 0-.25.25v4c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-4a.25.25 0 0 0-.25-.25ZM7 4.75A.75.75 0 0 1 7.75 4h4.5a.75.75 0 0 1 0 1.5h-4.5A.75.75 0 0 1 7 4.75ZM7.75 10h4.5a.75.75 0 0 1 0 1.5h-4.5a.75.75 0 0 1 0-1.5ZM3 4.75A.75.75 0 0 1 3.75 4h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 3 4.75ZM3.75 10h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5Z"></path>
</svg>
</template>

<template id="globe-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-globe">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM5.78 8.75a9.64 9.64 0 0 0 1.363 4.177c.255.426.542.832.857 1.215.245-.296.551-.705.857-1.215A9.64 9.64 0 0 0 10.22 8.75Zm4.44-1.5a9.64 9.64 0 0 0-1.363-4.177c-.307-.51-.612-.919-.857-1.215a9.927 9.927 0 0 0-.857 1.215A9.64 9.64 0 0 0 5.78 7.25Zm-5.944 1.5H1.543a6.507 6.507 0 0 0 4.666 5.5c-.123-.181-.24-.365-.352-.552-.715-1.192-1.437-2.874-1.581-4.948Zm-2.733-1.5h2.733c.144-2.074.866-3.756 1.58-4.948.12-.197.237-.381.353-.552a6.507 6.507 0 0 0-4.666 5.5Zm10.181 1.5c-.144 2.074-.866 3.756-1.58 4.948-.12.197-.237.381-.353.552a6.507 6.507 0 0 0 4.666-5.5Zm2.733-1.5a6.507 6.507 0 0 0-4.666-5.5c.123.181.24.365.353.552.714 1.192 1.436 2.874 1.58 4.948Z"></path>
</svg>
</template>

<template id="issue-opened-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-issue-opened">
    <path d="M8 9.5a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path><path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Z"></path>
</svg>
</template>

<template id="device-mobile-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-device-mobile">
    <path d="M3.75 0h8.5C13.216 0 14 .784 14 1.75v12.5A1.75 1.75 0 0 1 12.25 16h-8.5A1.75 1.75 0 0 1 2 14.25V1.75C2 .784 2.784 0 3.75 0ZM3.5 1.75v12.5c0 .138.112.25.25.25h8.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25h-8.5a.25.25 0 0 0-.25.25ZM8 13a1 1 0 1 1 0-2 1 1 0 0 1 0 2Z"></path>
</svg>
</template>

<template id="package-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-package">
    <path d="m8.878.392 5.25 3.045c.54.314.872.89.872 1.514v6.098a1.75 1.75 0 0 1-.872 1.514l-5.25 3.045a1.75 1.75 0 0 1-1.756 0l-5.25-3.045A1.75 1.75 0 0 1 1 11.049V4.951c0-.624.332-1.201.872-1.514L7.122.392a1.75 1.75 0 0 1 1.756 0ZM7.875 1.69l-4.63 2.685L8 7.133l4.755-2.758-4.63-2.685a.248.248 0 0 0-.25 0ZM2.5 5.677v5.372c0 .09.047.171.125.216l4.625 2.683V8.432Zm6.25 8.271 4.625-2.683a.25.25 0 0 0 .125-.216V5.677L8.75 8.432Z"></path>
</svg>
</template>

<template id="credit-card-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-credit-card">
    <path d="M10.75 9a.75.75 0 0 0 0 1.5h1.5a.75.75 0 0 0 0-1.5h-1.5Z"></path><path d="M0 3.75C0 2.784.784 2 1.75 2h12.5c.966 0 1.75.784 1.75 1.75v8.5A1.75 1.75 0 0 1 14.25 14H1.75A1.75 1.75 0 0 1 0 12.25ZM14.5 6.5h-13v5.75c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25Zm0-2.75a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25V5h13Z"></path>
</svg>
</template>

<template id="play-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-play">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Zm4.879-2.773 4.264 2.559a.25.25 0 0 1 0 .428l-4.264 2.559A.25.25 0 0 1 6 10.559V5.442a.25.25 0 0 1 .379-.215Z"></path>
</svg>
</template>

<template id="gift-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-gift">
    <path d="M2 2.75A2.75 2.75 0 0 1 4.75 0c.983 0 1.873.42 2.57 1.232.268.318.497.668.68 1.042.183-.375.411-.725.68-1.044C9.376.42 10.266 0 11.25 0a2.75 2.75 0 0 1 2.45 4h.55c.966 0 1.75.784 1.75 1.75v2c0 .698-.409 1.301-1 1.582v4.918A1.75 1.75 0 0 1 13.25 16H2.75A1.75 1.75 0 0 1 1 14.25V9.332C.409 9.05 0 8.448 0 7.75v-2C0 4.784.784 4 1.75 4h.55c-.192-.375-.3-.8-.3-1.25ZM7.25 9.5H2.5v4.75c0 .138.112.25.25.25h4.5Zm1.5 0v5h4.5a.25.25 0 0 0 .25-.25V9.5Zm0-4V8h5.5a.25.25 0 0 0 .25-.25v-2a.25.25 0 0 0-.25-.25Zm-7 0a.25.25 0 0 0-.25.25v2c0 .138.112.25.25.25h5.5V5.5h-5.5Zm3-4a1.25 1.25 0 0 0 0 2.5h2.309c-.233-.818-.542-1.401-.878-1.793-.43-.502-.915-.707-1.431-.707ZM8.941 4h2.309a1.25 1.25 0 0 0 0-2.5c-.516 0-1 .205-1.43.707-.337.392-.646.975-.879 1.793Z"></path>
</svg>
</template>

<template id="code-square-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code-square">
    <path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25Zm7.47 3.97a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 2a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L10.69 8 9.22 6.53a.75.75 0 0 1 0-1.06ZM6.78 6.53 5.31 8l1.47 1.47a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215l-2-2a.75.75 0 0 1 0-1.06l2-2a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path>
</svg>
</template>

<template id="device-desktop-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-device-desktop">
    <path d="M14.25 1c.966 0 1.75.784 1.75 1.75v7.5A1.75 1.75 0 0 1 14.25 12h-3.727c.099 1.041.52 1.872 1.292 2.757A.752.752 0 0 1 11.25 16h-6.5a.75.75 0 0 1-.565-1.243c.772-.885 1.192-1.716 1.292-2.757H1.75A1.75 1.75 0 0 1 0 10.25v-7.5C0 1.784.784 1 1.75 1ZM1.75 2.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25ZM9.018 12H6.982a5.72 5.72 0 0 1-.765 2.5h3.566a5.72 5.72 0 0 1-.765-2.5Z"></path>
</svg>
</template>

        <div class="position-relative">
                <ul
                  role="listbox"
                  class="ActionListWrap QueryBuilder-ListWrap"
                  aria-label="Suggestions"
                  data-action="
                    combobox-commit:query-builder#comboboxCommit
                    mousedown:query-builder#resultsMousedown
                  "
                  data-target="query-builder.resultsList"
                  data-persist-list=false
                  id="query-builder-test-results"
                ></ul>
        </div>
      <div class="FormControl-inlineValidation" id="validation-d9842016-babf-4885-ab02-96c584c5e287" hidden="hidden">
        <span class="FormControl-inlineValidation--visual">
          <svg aria-hidden="true" height="12" viewBox="0 0 12 12" version="1.1" width="12" data-view-component="true" class="octicon octicon-alert-fill">
    <path d="M4.855.708c.5-.896 1.79-.896 2.29 0l4.675 8.351a1.312 1.312 0 0 1-1.146 1.954H1.33A1.313 1.313 0 0 1 .183 9.058ZM7 7V3H5v4Zm-1 3a1 1 0 1 0 0-2 1 1 0 0 0 0 2Z"></path>
</svg>
        </span>
        <span></span>
</div>    </div>
    <div data-target="query-builder.screenReaderFeedback" aria-live="polite" aria-atomic="true" class="sr-only"></div>
</query-builder></form>
          <div class="d-flex flex-row color-fg-muted px-3 text-small color-bg-default search-feedback-prompt">
            <a target="_blank" href="https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax" data-view-component="true" class="Link color-fg-accent text-normal ml-2">Search syntax tips</a>            <div class="d-flex flex-1"></div>
              <button data-action="click:qbsearch-input#showFeedbackDialog" type="button" data-view-component="true" class="Button--link Button--medium Button color-fg-accent text-normal ml-2">  <span class="Button-content">
    <span class="Button-label">Give feedback</span>
  </span>
</button>
          </div>
        </div>
</div>

    </div>
</modal-dialog></div>
  </div>
  <div data-action="click:qbsearch-input#retract" class="dark-backdrop position-fixed" hidden data-target="qbsearch-input.darkBackdrop"></div>
  <div class="color-fg-default">
    
<dialog-helper>
  <dialog data-target="qbsearch-input.feedbackDialog" data-action="close:qbsearch-input#handleDialogClose cancel:qbsearch-input#handleDialogClose" id="feedback-dialog" aria-modal="true" aria-labelledby="feedback-dialog-title" aria-describedby="feedback-dialog-description" data-view-component="true" class="Overlay Overlay-whenNarrow Overlay--size-medium Overlay--motion-scaleFade Overlay--disableScroll">
    <div data-view-component="true" class="Overlay-header">
  <div class="Overlay-headerContentWrap">
    <div class="Overlay-titleWrap">
      <h1 class="Overlay-title " id="feedback-dialog-title">
        Provide feedback
      </h1>
        
    </div>
    <div class="Overlay-actionWrap">
      <button data-close-dialog-id="feedback-dialog" aria-label="Close" type="button" data-view-component="true" class="close-button Overlay-closeButton"><svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg></button>
    </div>
  </div>
  
</div>
      <scrollable-region data-labelled-by="feedback-dialog-title">
        <div data-view-component="true" class="Overlay-body">        <!-- '"` --><!-- </textarea></xmp> --></option></form><form id="code-search-feedback-form" data-turbo="false" action="/search/feedback" accept-charset="UTF-8" method="post"><input type="hidden" name="authenticity_token" value="ZTpqdXLFfpkJrBjOSS9lfojp8rn-KTyQTEd_-ZizBkbxGBfhStEMP65Qqzpcp8CrgrLzWbatqLgLiTUdLy-Izw" />
          <p>We read every piece of feedback, and take your input very seriously.</p>
          <textarea name="feedback" class="form-control width-full mb-2" style="height: 120px" id="feedback"></textarea>
          <input name="include_email" id="include_email" aria-label="Include my email address so I can be contacted" class="form-control mr-2" type="checkbox">
          <label for="include_email" style="font-weight: normal">Include my email address so I can be contacted</label>
</form></div>
      </scrollable-region>
      <div data-view-component="true" class="Overlay-footer Overlay-footer--alignEnd">          <button data-close-dialog-id="feedback-dialog" type="button" data-view-component="true" class="btn">    Cancel
</button>
          <button form="code-search-feedback-form" data-action="click:qbsearch-input#submitFeedback" type="submit" data-view-component="true" class="btn-primary btn">    Submit feedback
</button>
</div>
</dialog></dialog-helper>

    <custom-scopes data-target="qbsearch-input.customScopesManager">
    
<dialog-helper>
  <dialog data-target="custom-scopes.customScopesModalDialog" data-action="close:qbsearch-input#handleDialogClose cancel:qbsearch-input#handleDialogClose" id="custom-scopes-dialog" aria-modal="true" aria-labelledby="custom-scopes-dialog-title" aria-describedby="custom-scopes-dialog-description" data-view-component="true" class="Overlay Overlay-whenNarrow Overlay--size-medium Overlay--motion-scaleFade Overlay--disableScroll">
    <div data-view-component="true" class="Overlay-header Overlay-header--divided">
  <div class="Overlay-headerContentWrap">
    <div class="Overlay-titleWrap">
      <h1 class="Overlay-title " id="custom-scopes-dialog-title">
        Saved searches
      </h1>
        <h2 id="custom-scopes-dialog-description" class="Overlay-description">Use saved searches to filter your results more quickly</h2>
    </div>
    <div class="Overlay-actionWrap">
      <button data-close-dialog-id="custom-scopes-dialog" aria-label="Close" type="button" data-view-component="true" class="close-button Overlay-closeButton"><svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg></button>
    </div>
  </div>
  
</div>
      <scrollable-region data-labelled-by="custom-scopes-dialog-title">
        <div data-view-component="true" class="Overlay-body">        <div data-target="custom-scopes.customScopesModalDialogFlash"></div>

        <div hidden class="create-custom-scope-form" data-target="custom-scopes.createCustomScopeForm">
        <!-- '"` --><!-- </textarea></xmp> --></option></form><form id="custom-scopes-dialog-form" data-turbo="false" action="/search/custom_scopes" accept-charset="UTF-8" method="post"><input type="hidden" name="authenticity_token" value="SXBkbsF5E9vmK3WWTsbUxkrZi2_4L_TKJkpjKL2T8CG0GsgqpLMUoCX-BVJRhpRS4s-Hzndg1GOD9bJFl9zkWQ" />
          <div data-target="custom-scopes.customScopesModalDialogFlash"></div>

          <input type="hidden" id="custom_scope_id" name="custom_scope_id" data-target="custom-scopes.customScopesIdField">

          <div class="form-group">
            <label for="custom_scope_name">Name</label>
            <auto-check src="/search/custom_scopes/check_name" required only-validate-on-blur="false">
              <input
                type="text"
                name="custom_scope_name"
                id="custom_scope_name"
                data-target="custom-scopes.customScopesNameField"
                class="form-control"
                autocomplete="off"
                placeholder="github-ruby"
                required
                maxlength="50">
              <input type="hidden" value="UeNph_pvhwsQUrYnhK6yIc6X4j_9pyzGcEAdVWpcVElJLXWenI1twHWgXXiu5714PE7tOTmtNZu_ULML2OZ8dA" data-csrf="true" />
            </auto-check>
          </div>

          <div class="form-group">
            <label for="custom_scope_query">Query</label>
            <input
              type="text"
              name="custom_scope_query"
              id="custom_scope_query"
              data-target="custom-scopes.customScopesQueryField"
              class="form-control"
              autocomplete="off"
              placeholder="(repo:mona/a OR repo:mona/b) AND lang:python"
              required
              maxlength="500">
          </div>

          <p class="text-small color-fg-muted">
            To see all available qualifiers, see our <a class="Link--inTextBlock" href="https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax">documentation</a>.
          </p>
</form>        </div>

        <div data-target="custom-scopes.manageCustomScopesForm">
          <div data-target="custom-scopes.list"></div>
        </div>

</div>
      </scrollable-region>
      <div data-view-component="true" class="Overlay-footer Overlay-footer--alignEnd Overlay-footer--divided">          <button data-action="click:custom-scopes#customScopesCancel" type="button" data-view-component="true" class="btn">    Cancel
</button>
          <button form="custom-scopes-dialog-form" data-action="click:custom-scopes#customScopesSubmit" data-target="custom-scopes.customScopesSubmitButton" type="submit" data-view-component="true" class="btn-primary btn">    Create saved search
</button>
</div>
</dialog></dialog-helper>
    </custom-scopes>
  </div>
</qbsearch-input>  <input type="hidden" value="79O93enXNNAI0qKarZtL7YvaPXY__VK5vcLjdbL82TUBiUNHmspXI5YUaAI38KOTMKxmNOgKqeS8FI5qzmTm1A" data-csrf="true" class="js-data-jump-to-suggestions-path-csrf" />


          </div>

        
          <div class="AppHeader-CopilotChat">
    <react-partial-anchor>
      <button id="copilot-chat-header-button" data-target="react-partial-anchor.anchor" data-hotkey="Shift+C" aria-expanded="false" aria-controls="copilot-chat-panel" aria-labelledby="tooltip-76ff6244-0180-4501-884c-3b7667150ca2" type="button" disabled="disabled" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button AppHeader-buttonLeft cursor-wait">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copilot Button-visual">
    <path d="M7.998 15.035c-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.201-.508-.254-1.084-.254-1.656 0-.87.128-1.769.693-2.484.579-.733 1.494-1.124 2.724-1.261 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095v1.872c0 .766-3.351 3.795-8.002 3.795Zm0-1.485c2.28 0 4.584-1.11 5.002-1.433V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-1.146 0-2.059-.327-2.71-.991A3.222 3.222 0 0 1 8 6.303a3.24 3.24 0 0 1-.544.743c-.65.664-1.563.991-2.71.991-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433ZM6.762 2.83c-.193-.206-.637-.413-1.682-.297-1.019.113-1.479.404-1.713.7-.247.312-.369.789-.369 1.554 0 .793.129 1.171.308 1.371.162.181.519.379 1.442.379.853 0 1.339-.235 1.638-.54.315-.322.527-.827.617-1.553.117-.935-.037-1.395-.241-1.614Zm4.155-.297c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Z"></path><path d="M6.25 9.037a.75.75 0 0 1 .75.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 .75-.75Zm4.25.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 1.5 0Z"></path>
</svg>
</button><tool-tip id="tooltip-76ff6244-0180-4501-884c-3b7667150ca2" for="copilot-chat-header-button" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Chat with Copilot</tool-tip>

      <template data-target="react-partial-anchor.template">
        <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_react-relay_index_js-3e4c69718bad.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_tanstack_query-core_build_modern_queryObserver_js-node_modules_tanstack_-defd52-843b41414e0e.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_micromark-util-sanitize-uri_index_js-node_modules_remark-parse_lib_index-b69642-163efad98dc5.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_remark-gfm_lib_index_js-bfb9e2c9eabe.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_react-markdown_lib_index_js-2816acae350e.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_tanstack_react-query_build_modern_useQuery_js-node_modules_hast-util-fin-d142e3-fe0e76a2e3fe.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_github_mini-throttle_dist_decorators_js-node_modules_accname_dist_access-b37425-35bd8d94d981.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_github_combobox-nav_dist_index_js-node_modules_github_hotkey_dist_index_-2c4211-a3b6ffd98cc6.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_item-picker_constants_labels_ts-ui_packages_item-picker_constants_values_ts-ui_pa-163a9a-ee6b1c4387f2.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_item-picker_components_RepositoryPicker_tsx-fed97f53635f.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_copilot-chat_utils_copilot-local-storage_ts-ui_packages_hydro-analytics_hydro-ana-74ad7c-cd6ac89814da.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_copilot-chat_utils_copilot-chat-hooks_ts-ui_packages_issue-viewer_utils_queries_ts-8a23643c08a1.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_test-id-props_test-id-props_ts-ui_packages_copilot-markdown_MarkdownRenderer_tsx--cd0d45-16709ea47eec.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/copilot-chat-07129b2860fa.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/copilot-chat.4e64150ee8c92ed63ef0.module.css" />
        <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/copilot-markdown-rendering-f6845e8f5d6b.css" />
        <include-fragment src="/github-copilot/chat?skip_anchor=true"></include-fragment>
      </template>
    </react-partial-anchor>
    <react-partial-anchor>
      <button id="global-copilot-menu-button" data-target="react-partial-anchor.anchor" aria-expanded="false" aria-labelledby="tooltip-5ef2b7ac-6993-4818-becb-564c5a7ff6fe" type="button" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button AppHeader-buttonRight">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-triangle-down Button-visual">
    <path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path>
</svg>
</button><tool-tip id="tooltip-5ef2b7ac-6993-4818-becb-564c5a7ff6fe" for="global-copilot-menu-button" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Open Copilot…</tool-tip>

      <template data-target="react-partial-anchor.template">
        <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/global-copilot-menu-f997b4b96fc7.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />

<react-partial
  partial-name="global-copilot-menu"
  data-ssr="false"
  data-attempted-ssr="false"
>
  
  <script type="application/json" data-target="react-partial.embeddedData">{"props":{}}</script>
  <div data-target="react-partial.reactRoot"></div>
</react-partial>

      </template>
    </react-partial-anchor>
</div>


        <div class="AppHeader-actions position-relative">
             <react-partial-anchor>
      <button id="global-create-menu-anchor" aria-label="Create something new" data-target="react-partial-anchor.anchor" type="button" disabled="disabled" data-view-component="true" class="AppHeader-button global-create-button cursor-wait Button--secondary Button--medium Button width-auto color-fg-muted">  <span class="Button-content">
      <span class="Button-visual Button-leadingVisual">
        <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-plus">
    <path d="M7.75 2a.75.75 0 0 1 .75.75V7h4.25a.75.75 0 0 1 0 1.5H8.5v4.25a.75.75 0 0 1-1.5 0V8.5H2.75a.75.75 0 0 1 0-1.5H7V2.75A.75.75 0 0 1 7.75 2Z"></path>
</svg>
      </span>
    <span class="Button-label"><svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-triangle-down">
    <path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path>
</svg></span>
  </span>
</button><tool-tip id="tooltip-ed5feb45-0058-4d48-889f-fb3ac351e82f" for="global-create-menu-anchor" popover="manual" data-direction="s" data-type="description" data-view-component="true" class="sr-only position-absolute">Create new...</tool-tip>

      <template data-target="react-partial-anchor.template">
        <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_promise-with-resolvers-polyfill_promise-with-resolvers-polyfill_ts-ui_packages_re-8d43b0-ae8dde838777.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/global-create-menu-7510a0ee7657.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />

<react-partial
  partial-name="global-create-menu"
  data-ssr="false"
  data-attempted-ssr="false"
>
  
  <script type="application/json" data-target="react-partial.embeddedData">{"props":{"createRepo":true,"importRepo":true,"codespaces":true,"gist":true,"createOrg":true,"createProject":false,"createProjectUrl":"/DominiqueLoyer?tab=projects","createLegacyProject":false,"createIssue":false,"org":null,"owner":"Alimiji","repo":"Solr_utilisation"}}</script>
  <div data-target="react-partial.reactRoot"></div>
</react-partial>

      </template>
    </react-partial-anchor>


          <a href="/issues" data-analytics-event="{&quot;category&quot;:&quot;Global navigation&quot;,&quot;action&quot;:&quot;ISSUES_HEADER&quot;,&quot;label&quot;:null}" id="icon-button-09e73c72-2cbc-4558-a2bf-cf207d9ca0cb" aria-labelledby="tooltip-9578566a-4e4f-4396-aa33-40d1d3b0cabc" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-issue-opened Button-visual">
    <path d="M8 9.5a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path><path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Z"></path>
</svg>
</a><tool-tip id="tooltip-9578566a-4e4f-4396-aa33-40d1d3b0cabc" for="icon-button-09e73c72-2cbc-4558-a2bf-cf207d9ca0cb" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Your issues</tool-tip>

          <a href="/pulls" data-analytics-event="{&quot;category&quot;:&quot;Global navigation&quot;,&quot;action&quot;:&quot;PULL_REQUESTS_HEADER&quot;,&quot;label&quot;:null}" id="icon-button-4c1d565c-b40d-471a-a91c-c8d37024cb02" aria-labelledby="tooltip-33a1455f-d205-4aeb-82d8-ca553d989408" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-git-pull-request Button-visual">
    <path d="M1.5 3.25a2.25 2.25 0 1 1 3 2.122v5.256a2.251 2.251 0 1 1-1.5 0V5.372A2.25 2.25 0 0 1 1.5 3.25Zm5.677-.177L9.573.677A.25.25 0 0 1 10 .854V2.5h1A2.5 2.5 0 0 1 13.5 5v5.628a2.251 2.251 0 1 1-1.5 0V5a1 1 0 0 0-1-1h-1v1.646a.25.25 0 0 1-.427.177L7.177 3.427a.25.25 0 0 1 0-.354ZM3.75 2.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm0 9.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm8.25.75a.75.75 0 1 0 1.5 0 .75.75 0 0 0-1.5 0Z"></path>
</svg>
</a><tool-tip id="tooltip-33a1455f-d205-4aeb-82d8-ca553d989408" for="icon-button-4c1d565c-b40d-471a-a91c-c8d37024cb02" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Your pull requests</tool-tip>

        </div>

        <notification-indicator data-channel="eyJjIjoibm90aWZpY2F0aW9uLWNoYW5nZWQ6MTA1MjI0OTIiLCJ0IjoxNzQxMjAwNjc1fQ==--0901f66d43c8d0ce4cd5ca93a0af6dabd04455e6ed4ab96f83841cc0cdfbce41" data-indicator-mode="none" data-tooltip-global="You have unread notifications" data-tooltip-unavailable="Notifications are unavailable at the moment." data-tooltip-none="You have no unread notifications" data-header-redesign-enabled="true" data-fetch-indicator-src="/notifications/indicator" data-fetch-indicator-enabled="true" data-view-component="true" class="js-socket-channel">
extraire_fichiers.py (repo Alimiji/Solr_utilisation)

This Python script prepares the AP document collection (TREC format) for indexing in Solr. It detects each source file's character encoding with chardet, splits the files into <DOC>…</DOC> blocks, and rewrites every document as a Solr <add>/<doc> XML structure with one <field> per TREC tag (DOCNO, FILEID, FIRST, SECOND, HEAD, DATELINE, TEXT). Commented-out sections cover the initial decompression of the .gz source files and the extraction of long and short queries from the TREC topics files into JSON.

Keywords: Solr, XML indexing, encoding detection, AP collection (TREC), query extraction

import json
import os
import gzip
import shutil
import chardet
from requetes import extraire_requetes_longues, extraire_requetes_courtes

# Folder paths for the initial .gz extraction pass (see the commented-out block below)
input_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP'
output_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP_ok'
os.makedirs(output_folder, exist_ok=True)

# Folder paths for the XML conversion pass (these override the values above)
input_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP__ok'
output_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP_fixed'
os.makedirs(output_folder, exist_ok=True)


# Detect a file's character encoding
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']


# Transform one TREC document (a list of lines) into a Solr <doc> XML structure
def transform_document(lines):
    doc_lines = []
    doc_lines.append("  <doc>")  # Start of the document
    for line in lines:
        line = line.strip()
        if line.startswith("<DOCNO>"):
            content = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
            doc_lines.append(f'    <field name="DOCNO">{content}</field>')
        elif line.startswith("<FILEID>"):
            content = line.replace("<FILEID>", "").replace("</FILEID>", "").strip()
            doc_lines.append(f'    <field name="FILEID">{content}</field>')
        elif line.startswith("<FIRST>"):
            content = line.replace("<FIRST>", "").replace("</FIRST>", "").strip()
            doc_lines.append(f'    <field name="FIRST">{content}</field>')
        elif line.startswith("<SECOND>"):
            content = line.replace("<SECOND>", "").replace("</SECOND>", "").strip()
            doc_lines.append(f'    <field name="SECOND">{content}</field>')
        elif line.startswith("<HEAD>"):
            content = line.replace("<HEAD>", "").replace("</HEAD>", "").strip()
            doc_lines.append(f'    <field name="HEAD">{content}</field>')
        elif line.startswith("<DATELINE>"):
            content = line.replace("<DATELINE>", "").replace("</DATELINE>", "").strip()
            doc_lines.append(f'    <field name="DATELINE">{content}</field>')
        elif line.startswith("<TEXT>"):
            content = line.replace("<TEXT>", "").replace("</TEXT>", "").strip()
            doc_lines.append(f'    <field name="TEXT">{content}</field>')
    doc_lines.append("  </doc>")  # End of the document
    return doc_lines


# Walk through every XML file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.xml'):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        try:
            # Detect the encoding
            encoding = detect_encoding(input_path)
            print(f"Detected encoding for {filename}: {encoding}")

            # Read the file with the detected encoding
            with open(input_path, 'r', encoding=encoding, errors='ignore') as file:
                lines = file.readlines()

            # Transform and write to the output file
            with open(output_path, 'w', encoding='utf-8') as out_file:
                out_file.write("<add>\n")  # Start of the Solr root element
                current_doc = []
                in_doc = False
                for line in lines:
                    if "<DOC>" in line:  # Start of a document
                        in_doc = True
                        current_doc = []
                    elif "</DOC>" in line:  # End of a document
                        in_doc = False
                        # Transform the document and write it to the output file
                        transformed_doc = transform_document(current_doc)
                        out_file.write("\n".join(transformed_doc) + "\n")
                    elif in_doc:
                        current_doc.append(line)
                out_file.write("</add>\n")  # End of the Solr root element

            print(f"File fixed and converted: {output_path}")

        except Exception as e:
            print(f"Error while processing {filename}: {e}")


"""
# Initial pass: walk through every file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.gz'):  # Check that the file is a .gz archive
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename[:-3] + '.xml')  # Drop '.gz' and add '.xml'

        # Decompress the file and write it directly with the .xml extension
        with gzip.open(input_path, 'rb') as f_in:
            with open(output_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        print(f"File extracted and saved as XML: {output_path}")
"""

"""
# Creation of the long and short queries from the TREC topics files

# List of topics files to process
files = ['Topics-requetes/topics.1-50.txt', 'Topics-requetes/topics.51-100.txt', 'Topics-requetes/topics.101-150.txt']

def lire_fichier(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

req_longues_combines = {}
req_courtes_combines = {}

for fichier in files:
    data = lire_fichier(fichier)
    req_longues_combines.update(extraire_requetes_longues(data))

for fichier in files:
    data = lire_fichier(fichier)
    req_courtes_combines.update(extraire_requetes_courtes(data))

# Conversion to JSON files
with open('requetes/requetes_longues.json', 'w', encoding='utf-8') as fichier_json:
    json.dump(req_longues_combines, fichier_json, ensure_ascii=False, indent=4)

with open('requetes/requetes_courtes.json', 'w', encoding='utf-8') as fichier_json:
    json.dump(req_courtes_combines, fichier_json, ensure_ascii=False, indent=4)
"""
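To make the transformation concrete, here is a minimal sketch of what transform_document produces for a single AP document. The DOCNO, headline, and text values are made up for illustration, and the snippet assumes the function defined in the script above is in scope.

# Minimal sketch: feed one hypothetical AP document to transform_document
sample = [
    "<DOCNO> AP880212-0001 </DOCNO>",
    "<HEAD>Example headline</HEAD>",
    "<TEXT>Body of the article.</TEXT>",
]
print("\n".join(transform_document(sample)))
# Expected output:
#   <doc>
#     <field name="DOCNO">AP880212-0001</field>
#     <field name="HEAD">Example headline</field>
#     <field name="TEXT">Body of the article.</field>
#   </doc>

The generated <add> files can then be sent to a Solr core's update handler (for example with curl or bin/post); the exact core name and URL depend on the local Solr setup.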
.kVWtTz{gap:8px;}/*!sc*/
.gWqxTd{padding-left:8px;padding-right:8px;}/*!sc*/
.gWqxTd linkButtonSx:hover:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.gWqxTd linkButtonSx:focus:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.gWqxTd linkButtonSx:active:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.ivobqY[data-size="small"][data-no-visuals]{border-top-left-radius:0;border-bottom-left-radius:0;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]{border-top-right-radius:0;border-bottom-right-radius:0;border-right-width:0;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]:hover:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]:focus:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]:active:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.hySUEo[data-size="small"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));position:relative;}/*!sc*/
.itGLhU[data-size="small"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.hycJXc{border:1px solid;border-top:none;border-color:var(--borderColor-default,var(--color-border-default,#30363d));border-radius:0px 0px 6px 6px;min-width:273px;}/*!sc*/
.dceWRL{background-color:var(--bgColor-default,var(--color-canvas-default));border:0px;border-width:0;border-radius:0px 0px 6px 6px;padding:0;min-width:0;margin-top:46px;}/*!sc*/
.dGXHv{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex:1;-ms-flex:1;flex:1;padding-top:8px;padding-bottom:8px;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;min-width:0;position:relative;}/*!sc*/
.bpDFns{position:relative;}/*!sc*/
.iJOeCH{-webkit-flex:1;-ms-flex:1;flex:1;position:relative;min-width:0;}/*!sc*/
.jewUnv{tab-size:8;isolation:isolate;position:relative;overflow:auto;max-width:unset;}/*!sc*/
.cJGaMs{margin:1px 8px;position:absolute;z-index:1;}/*!sc*/
.iGLarr{position:absolute;}/*!sc*/
.mgQhK{padding-bottom:33px;}/*!sc*/
.ipeRWy{background-color:var(--bgColor-default,var(--color-canvas-default,#0d1117));border:1px solid;border-color:var(--borderColor-default,var(--color-border-default,#30363d));border-radius:6px;contain:paint;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;height:100%;min-height:0;max-height:100vh;overflow-y:auto;right:0;top:0px;z-index:4;background:var(--bgColor-default,var(--color-canvas-default));position:-webkit-sticky;position:sticky;}/*!sc*/
.cxUsTr{padding-top:8px;padding-bottom:8px;padding-left:16px;padding-right:16px;}/*!sc*/
.jXkPPw{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;}/*!sc*/
.hECgeo{font-size:14px;-webkit-order:1;-ms-flex-order:1;order:1;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;font-weight:600;}/*!sc*/
.fotqAA[data-size="medium"][data-no-visuals]{-webkit-order:3;-ms-flex-order:3;order:3;color:var(--fgColor-default,var(--color-fg-default,#e6edf3));margin-right:-8px;}/*!sc*/
.hoyhab{font-size:12px;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));padding-top:8px;}/*!sc*/
.gqhZpQ{margin-right:6px;}/*!sc*/
.ccgkJf{margin-left:-16px;margin-bottom:-8px;}/*!sc*/
.kACRto{margin-bottom:-8px;overflow-y:auto;max-height:calc(100vh - 237px);padding-left:16px;padding-bottom:8px;padding-top:4px;}/*!sc*/
.cSURfY{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;}/*!sc*/
.bTXewe{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;position:relative;margin-right:8px;}/*!sc*/
.dotKsF{background-color:var(--color-prettylights-syntax-variable,#ffa657);opacity:0.1;position:absolute;border-radius:5px;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;height:100%;}/*!sc*/
.iGIwaf{color:var(--color-prettylights-syntax-variable,#ffa657);border-radius:5px;font-weight:600;font-size:smaller;padding-left:4px;padding-right:4px;padding-top:1px;padding-bottom:1px;}/*!sc*/
.gxAxAi{background-color:var(--color-prettylights-syntax-entity,#d2a8ff);opacity:0.1;position:absolute;border-radius:5px;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;height:100%;}/*!sc*/
.gWkFIQ{color:var(--color-prettylights-syntax-entity,#d2a8ff);border-radius:5px;font-weight:600;font-size:smaller;padding-left:4px;padding-right:4px;padding-top:1px;padding-bottom:1px;}/*!sc*/
.cCoXib{position:fixed;top:0;right:0;height:100%;width:15px;-webkit-transition:-webkit-transform 0.3s;-webkit-transition:transform 0.3s;transition:transform 0.3s;z-index:1;}/*!sc*/
.cCoXib:hover{-webkit-transform:scaleX(1.5);-ms-transform:scaleX(1.5);transform:scaleX(1.5);}/*!sc*/
data-styled.g1[id="Box-sc-g0xbh4-0"]{content:"hOfjFo,oDGAe,kowOcT,gISSDQ,fHCyST,hPvFuC,fFSoPl,birIjn,hNNCwk,jfIeyl,XosP,hMLRgO,gUkoLg,kOkWgo,lhbroM,khzwtX,JMXqM,bZBlpz,bJjzmO,ffLUq,eTeVqd,jNHrPP,ijefGF,ftzGWg,dItACB,gjtfVk,cOxzdh,bTBnTW,fFMzrG,iKqMNA,FxAyp,leYMvG,KMPzq,hfKjHv,gZWyZE,dwYKDk,iDtIiT,cEytCf,fzFXnm,iMnkmv,ghzDag,kHuKdh,jGhzSQ,faNtbn,dwNhzn,kVRliy,dJxjrT,eFxKDQ,dzCJzi,ldRxiI,efRoCL,gNAmSV,jNEwzY,ifyOQK,jdLMhu,tOISc,hqwSEx,lzKZY,fHind,dnZoUW,dpNnZU,gpHFJV,iNMjfP,fefCSX,sulSy,kcLCKF,kVWtTz,gWqxTd,ivobqY,kilKoS,hySUEo,itGLhU,hycJXc,dceWRL,dGXHv,bpDFns,iJOeCH,jewUnv,cJGaMs,iGLarr,mgQhK,ipeRWy,cxUsTr,jXkPPw,hECgeo,fotqAA,hoyhab,gqhZpQ,ccgkJf,kACRto,cSURfY,bTXewe,dotKsF,iGIwaf,gxAxAi,gWkFIQ,cCoXib,"}/*!sc*/
.eMMFM{min-width:0;}/*!sc*/
.eMMFM:where([data-size='small']){font-size:var(--text-body-size-small,0.75rem);line-height:var(--text-body-lineHeight-small,1.6666);}/*!sc*/
.eMMFM:where([data-size='medium']){font-size:var(--text-body-size-medium,0.875rem);line-height:var(--text-body-lineHeight-medium,1.4285);}/*!sc*/
.eMMFM:where([data-size='large']){font-size:var(--text-body-size-large,1rem);line-height:var(--text-body-lineHeight-large,1.5);}/*!sc*/
.eMMFM:where([data-weight='light']){font-weight:var(--base-text-weight-light,300);}/*!sc*/
.eMMFM:where([data-weight='normal']){font-weight:var(--base-text-weight-normal,400);}/*!sc*/
.eMMFM:where([data-weight='medium']){font-weight:var(--base-text-weight-medium,500);}/*!sc*/
.eMMFM:where([data-weight='semibold']){font-weight:var(--base-text-weight-semibold,600);}/*!sc*/
.iHQnrN{padding-left:4px;padding-right:4px;font-weight:400;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));font-size:16px;}/*!sc*/
.iHQnrN:where([data-size='small']){font-size:var(--text-body-size-small,0.75rem);line-height:var(--text-body-lineHeight-small,1.6666);}/*!sc*/
.iHQnrN:where([data-size='medium']){font-size:var(--text-body-size-medium,0.875rem);line-height:var(--text-body-lineHeight-medium,1.4285);}/*!sc*/
.iHQnrN:where([data-size='large']){font-size:var(--text-body-size-large,1rem);line-height:var(--text-body-lineHeight-large,1.5);}/*!sc*/
.iHQnrN:where([data-weight='light']){font-weight:var(--base-text-weight-light,300);}/*!sc*/
.iHQnrN:where([data-weight='normal']){font-weight:var(--base-text-weight-normal,400);}/*!sc*/
.iHQnrN:where([data-weight='medium']){font-weight:var(--base-text-weight-medium,500);}/*!sc*/
.iHQnrN:where([data-weight='semibold']){font-weight:var(--base-text-weight-semibold,600);}/*!sc*/
.wcuBT{padding-left:4px;padding-right:4px;font-weight:400;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));font-size:14px;}/*!sc*/
.wcuBT:where([data-size='small']){font-size:var(--text-body-size-small,0.75rem);line-height:var(--text-body-lineHeight-small,1.6666);}/*!sc*/
.wcuBT:where([data-size='medium']){font-size:var(--text-body-size-medium,0.875rem);line-height:var(--text-body-lineHeight-medium,1.4285);}/*!sc*/
.wcuBT:where([data-size='large']){font-size:var(--text-body-size-large,1rem);line-height:var(--text-body-lineHeight-large,1.5);}/*!sc*/
.wcuBT:where([data-weight='light']){font-weight:var(--base-text-weight-light,300);}/*!sc*/
.wcuBT:where([data-weight='normal']){font-weight:var(--base-text-weight-normal,400);}/*!sc*/
.wcuBT:where([data-weight='medium']){font-weight:var(--base-text-weight-medium,500);}/*!sc*/
.wcuBT:where([data-weight='semibold']){font-weight:var(--base-text-weight-semibold,600);}/*!sc*/
data-styled.g5[id="Text__StyledText-sc-17v1xeu-0"]{content:"eMMFM,iHQnrN,wcuBT,"}/*!sc*/
.brGdpi{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;-webkit-clip:rect(0,0,0,0);clip:rect(0,0,0,0);white-space:nowrap;border-width:0;}/*!sc*/
data-styled.g6[id="_VisuallyHidden__VisuallyHidden-sc-11jhm7a-0"]{content:"brGdpi,"}/*!sc*/
.jkNcAv{border:0;font-size:inherit;font-family:inherit;background-color:transparent;-webkit-appearance:none;color:inherit;width:100%;}/*!sc*/
.jkNcAv:focus{outline:0;}/*!sc*/
data-styled.g13[id="UnstyledTextInput__ToggledUnstyledTextInput-sc-14ypya-0"]{content:"jkNcAv,"}/*!sc*/
.bclhiL{font-size:14px;line-height:var(--base-size-20);color:var(--fgColor-default,var(--color-fg-default,#e6edf3));vertical-align:middle;background-color:var(--bgColor-default,var(--color-canvas-default,#0d1117));border:1px solid var(--control-borderColor-rest,var(--borderColor-default,var(--color-border-default,#30363d)));border-radius:6px;outline:none;box-shadow:var(--shadow-inset,var(--color-primer-shadow-inset,0 0 transparent));display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;min-height:var(--base-size-32);overflow:hidden;--inner-action-size:var(--base-size-24);}/*!sc*/
.bclhiL input,.bclhiL textarea{cursor:text;}/*!sc*/
.bclhiL select{cursor:pointer;}/*!sc*/
.bclhiL input::-webkit-input-placeholder,.bclhiL textarea::-webkit-input-placeholder,.bclhiL select::-webkit-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL input::-moz-placeholder,.bclhiL textarea::-moz-placeholder,.bclhiL select::-moz-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL input:-ms-input-placeholder,.bclhiL textarea:-ms-input-placeholder,.bclhiL select:-ms-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL input::placeholder,.bclhiL textarea::placeholder,.bclhiL select::placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL:where([data-trailing-action][data-focused]),.bclhiL:where(:not([data-trailing-action]):focus-within){border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.bclhiL > textarea{padding:var(--base-size-12);}/*!sc*/
.bclhiL:where([data-contrast]){background-color:var(--bgColor-inset,var(--color-canvas-inset,#010409));}/*!sc*/
.bclhiL:where([data-disabled]){color:var(--fgColor-disabled,var(--color-primer-fg-disabled,#484f58));background-color:var(--control-bgColor-disabled,var(--color-input-disabled-bg,rgba(110,118,129,0)));box-shadow:none;border-color:var(--control-borderColor-disabled,var(--borderColor-default,var(--color-border-default,#30363d)));}/*!sc*/
.bclhiL:where([data-disabled]) input,.bclhiL:where([data-disabled]) textarea,.bclhiL:where([data-disabled]) select{cursor:not-allowed;}/*!sc*/
.bclhiL:where([data-monospace]){font-family:var(--fontStack-monospace,SFMono-Regular,Consolas,"Liberation Mono",Menlo,Courier,monospace);}/*!sc*/
.bclhiL:where([data-validation='error']){border-color:var(--borderColor-danger-emphasis,var(--color-danger-emphasis,#da3633));}/*!sc*/
.bclhiL:where([data-validation='error']):where([data-trailing-action][data-focused]),.bclhiL:where([data-validation='error']):where(:not([data-trailing-action])):focus-within{border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.bclhiL:where([data-validation='success']){border-color:var(--bgColor-success-emphasis,var(--color-success-emphasis,#238636));}/*!sc*/
.bclhiL:where([data-block]){width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-self:stretch;-ms-flex-item-align:stretch;align-self:stretch;}/*!sc*/
@media (min-width:768px){.bclhiL{font-size:var(--text-body-size-medium);}}/*!sc*/
.bclhiL:where([data-size='small']){--inner-action-size:var(--base-size-20);min-height:var(--base-size-28);padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:var(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.bclhiL:where([data-size='large']){--inner-action-size:var(--base-size-28);height:var(--base-size-40);padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);}/*!sc*/
.bclhiL:where([data-variant='small']){min-height:28px;padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.bclhiL:where([data-variant='large']){padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);font-size:var(--text-title-size-medium);}/*!sc*/
.bclhiL{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;min-width:160px;}/*!sc*/
.zEBjf{font-size:14px;line-height:var(--base-size-20);color:var(--fgColor-default,var(--color-fg-default,#e6edf3));vertical-align:middle;background-color:var(--bgColor-default,var(--color-canvas-default,#0d1117));border:1px solid var(--control-borderColor-rest,var(--borderColor-default,var(--color-border-default,#30363d)));border-radius:6px;outline:none;box-shadow:var(--shadow-inset,var(--color-primer-shadow-inset,0 0 transparent));display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;min-height:var(--base-size-32);overflow:hidden;--inner-action-size:var(--base-size-24);}/*!sc*/
.zEBjf input,.zEBjf textarea{cursor:text;}/*!sc*/
.zEBjf select{cursor:pointer;}/*!sc*/
.zEBjf input::-webkit-input-placeholder,.zEBjf textarea::-webkit-input-placeholder,.zEBjf select::-webkit-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf input::-moz-placeholder,.zEBjf textarea::-moz-placeholder,.zEBjf select::-moz-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf input:-ms-input-placeholder,.zEBjf textarea:-ms-input-placeholder,.zEBjf select:-ms-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf input::placeholder,.zEBjf textarea::placeholder,.zEBjf select::placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf:where([data-trailing-action][data-focused]),.zEBjf:where(:not([data-trailing-action]):focus-within){border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.zEBjf > textarea{padding:var(--base-size-12);}/*!sc*/
.zEBjf:where([data-contrast]){background-color:var(--bgColor-inset,var(--color-canvas-inset,#010409));}/*!sc*/
.zEBjf:where([data-disabled]){color:var(--fgColor-disabled,var(--color-primer-fg-disabled,#484f58));background-color:var(--control-bgColor-disabled,var(--color-input-disabled-bg,rgba(110,118,129,0)));box-shadow:none;border-color:var(--control-borderColor-disabled,var(--borderColor-default,var(--color-border-default,#30363d)));}/*!sc*/
.zEBjf:where([data-disabled]) input,.zEBjf:where([data-disabled]) textarea,.zEBjf:where([data-disabled]) select{cursor:not-allowed;}/*!sc*/
.zEBjf:where([data-monospace]){font-family:var(--fontStack-monospace,SFMono-Regular,Consolas,"Liberation Mono",Menlo,Courier,monospace);}/*!sc*/
.zEBjf:where([data-validation='error']){border-color:var(--borderColor-danger-emphasis,var(--color-danger-emphasis,#da3633));}/*!sc*/
.zEBjf:where([data-validation='error']):where([data-trailing-action][data-focused]),.zEBjf:where([data-validation='error']):where(:not([data-trailing-action])):focus-within{border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.zEBjf:where([data-validation='success']){border-color:var(--bgColor-success-emphasis,var(--color-success-emphasis,#238636));}/*!sc*/
.zEBjf:where([data-block]){width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-self:stretch;-ms-flex-item-align:stretch;align-self:stretch;}/*!sc*/
@media (min-width:768px){.zEBjf{font-size:var(--text-body-size-medium);}}/*!sc*/
.zEBjf:where([data-size='small']){--inner-action-size:var(--base-size-20);min-height:var(--base-size-28);padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:var(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.zEBjf:where([data-size='large']){--inner-action-size:var(--base-size-28);height:var(--base-size-40);padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);}/*!sc*/
.zEBjf:where([data-variant='small']){min-height:28px;padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.zEBjf:where([data-variant='large']){padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);font-size:var(--text-title-size-medium);}/*!sc*/
.zEBjf{margin-top:8px;border-radius:6px;}/*!sc*/
data-styled.g14[id="TextInputWrapper__StyledTextInputBaseWrapper-sc-1mqhpbi-0"]{content:"bclhiL,zEBjf,"}/*!sc*/
.jpROxA{background-repeat:no-repeat;background-position:right 8px center;padding-right:0;padding-left:0;}/*!sc*/
.jpROxA > :not(:last-child){margin-right:8px;}/*!sc*/
.jpROxA .TextInput-icon,.jpROxA .TextInput-action{-webkit-align-self:center;-ms-flex-item-align:center;align-self:center;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;}/*!sc*/
.jpROxA > input,.jpROxA > select{padding-right:0;padding-left:0;}/*!sc*/
.jpROxA:where([data-leading-visual]){padding-left:var(--base-size-12);}/*!sc*/
.jpROxA:where([data-trailing-visual]:not([data-trailing-action])){padding-right:var(--base-size-12);}/*!sc*/
.jpROxA:where(:not([data-leading-visual])) > input,.jpROxA:where(:not([data-leading-visual])) > select{padding-left:var(--base-size-12);}/*!sc*/
.jpROxA:where(:not([data-trailing-visual]):not([data-trailing-action])) > input,.jpROxA:where(:not([data-trailing-visual]):not([data-trailing-action])) > select{padding-right:var(--base-size-12);}/*!sc*/
.jpROxA{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;min-width:160px;}/*!sc*/
.kOcqDw{background-repeat:no-repeat;background-position:right 8px center;padding-right:0;padding-left:0;}/*!sc*/
.kOcqDw > :not(:last-child){margin-right:8px;}/*!sc*/
.kOcqDw .TextInput-icon,.kOcqDw .TextInput-action{-webkit-align-self:center;-ms-flex-item-align:center;align-self:center;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;}/*!sc*/
.kOcqDw > input,.kOcqDw > select{padding-right:0;padding-left:0;}/*!sc*/
.kOcqDw:where([data-leading-visual]){padding-left:var(--base-size-12);}/*!sc*/
.kOcqDw:where([data-trailing-visual]:not([data-trailing-action])){padding-right:var(--base-size-12);}/*!sc*/
.kOcqDw:where(:not([data-leading-visual])) > input,.kOcqDw:where(:not([data-leading-visual])) > select{padding-left:var(--base-size-12);}/*!sc*/
.kOcqDw:where(:not([data-trailing-visual]):not([data-trailing-action])) > input,.kOcqDw:where(:not([data-trailing-visual]):not([data-trailing-action])) > select{padding-right:var(--base-size-12);}/*!sc*/
.kOcqDw{margin-top:8px;border-radius:6px;}/*!sc*/
data-styled.g15[id="TextInputWrapper__StyledTextInputWrapper-sc-1mqhpbi-1"]{content:"jpROxA,kOcqDw,"}/*!sc*/
.fLAhLl{display:none;}/*!sc*/
.fLAhLl[popover]{position:absolute;padding:0.5em 0.75em;width:-webkit-max-content;width:-moz-max-content;width:max-content;margin:auto;-webkit-clip:auto;clip:auto;white-space:normal;font:normal normal 11px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";-webkit-font-smoothing:subpixel-antialiased;color:var(--tooltip-fgColor,var(--fgColor-onEmphasis,var(--color-fg-on-emphasis,#ffffff)));text-align:center;word-wrap:break-word;background:var(--tooltip-bgColor,var(--bgColor-emphasis,var(--color-neutral-emphasis-plus,#6e7681)));border-radius:6px;border:0;opacity:0;max-width:250px;inset:auto;overflow:visible;}/*!sc*/
.fLAhLl[popover]:popover-open{display:block;}/*!sc*/
.fLAhLl[popover].\:popover-open{display:block;}/*!sc*/
@media (forced-colors:active){.fLAhLl{outline:1px solid transparent;}}/*!sc*/
.fLAhLl::after{position:absolute;display:block;right:0;left:0;height:var(--overlay-offset,0.25rem);content:'';}/*!sc*/
.fLAhLl[data-direction='n']::after,.fLAhLl[data-direction='ne']::after,.fLAhLl[data-direction='nw']::after{top:100%;}/*!sc*/
.fLAhLl[data-direction='s']::after,.fLAhLl[data-direction='se']::after,.fLAhLl[data-direction='sw']::after{bottom:100%;}/*!sc*/
.fLAhLl[data-direction='w']::after{position:absolute;display:block;height:100%;width:8px;content:'';bottom:0;left:100%;}/*!sc*/
.fLAhLl[data-direction='e']::after{position:absolute;display:block;height:100%;width:8px;content:'';bottom:0;right:100%;margin-left:-8px;}/*!sc*/
@-webkit-keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
@keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
.fLAhLl:popover-open,.fLAhLl:popover-open::before{-webkit-animation-name:tooltip-appear;animation-name:tooltip-appear;-webkit-animation-duration:0.1s;animation-duration:0.1s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in;-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
.fLAhLl.\:popover-open,.fLAhLl.\:popover-open::before{-webkit-animation-name:tooltip-appear;animation-name:tooltip-appear;-webkit-animation-duration:0.1s;animation-duration:0.1s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in;-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
data-styled.g16[id="Tooltip__StyledTooltip-sc-e45c7z-0"]{content:"fLAhLl,"}/*!sc*/
.fiSvBN{position:relative;display:inline-block;}/*!sc*/
.fiSvBN::after{position:absolute;z-index:1000000;display:none;padding:0.5em 0.75em;font:normal normal 11px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";-webkit-font-smoothing:subpixel-antialiased;color:var(--tooltip-fgColor,var(--fgColor-onEmphasis,var(--color-fg-on-emphasis,#ffffff)));text-align:center;-webkit-text-decoration:none;text-decoration:none;text-shadow:none;text-transform:none;-webkit-letter-spacing:normal;-moz-letter-spacing:normal;-ms-letter-spacing:normal;letter-spacing:normal;word-wrap:break-word;white-space:pre;pointer-events:none;content:attr(aria-label);background:var(--tooltip-bgColor,var(--bgColor-emphasis,var(--color-neutral-emphasis-plus,#6e7681)));border-radius:6px;opacity:0;}/*!sc*/
@-webkit-keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
@keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
.fiSvBN:hover::after,.fiSvBN:active::after,.fiSvBN:focus::after,.fiSvBN:focus-within::after{display:inline-block;-webkit-text-decoration:none;text-decoration:none;-webkit-animation-name:tooltip-appear;animation-name:tooltip-appear;-webkit-animation-duration:0.1s;animation-duration:0.1s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in;-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
.fiSvBN.tooltipped-no-delay:hover::after,.fiSvBN.tooltipped-no-delay:active::after,.fiSvBN.tooltipped-no-delay:focus::after,.fiSvBN.tooltipped-no-delay:focus-within::after{-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
.fiSvBN.tooltipped-multiline:hover::after,.fiSvBN.tooltipped-multiline:active::after,.fiSvBN.tooltipped-multiline:focus::after,.fiSvBN.tooltipped-multiline:focus-within::after{display:table-cell;}/*!sc*/
.fiSvBN.tooltipped-s::after,.fiSvBN.tooltipped-se::after,.fiSvBN.tooltipped-sw::after{top:100%;right:50%;margin-top:6px;}/*!sc*/
.fiSvBN.tooltipped-se::after{right:auto;left:50%;margin-left:-16px;}/*!sc*/
.fiSvBN.tooltipped-sw::after{margin-right:-16px;}/*!sc*/
.fiSvBN.tooltipped-n::after,.fiSvBN.tooltipped-ne::after,.fiSvBN.tooltipped-nw::after{right:50%;bottom:100%;margin-bottom:6px;}/*!sc*/
.fiSvBN.tooltipped-ne::after{right:auto;left:50%;margin-left:-16px;}/*!sc*/
.fiSvBN.tooltipped-nw::after{margin-right:-16px;}/*!sc*/
.fiSvBN.tooltipped-s::after,.fiSvBN.tooltipped-n::after{-webkit-transform:translateX(50%);-ms-transform:translateX(50%);transform:translateX(50%);}/*!sc*/
.fiSvBN.tooltipped-w::after{right:100%;bottom:50%;margin-right:6px;-webkit-transform:translateY(50%);-ms-transform:translateY(50%);transform:translateY(50%);}/*!sc*/
.fiSvBN.tooltipped-e::after{bottom:50%;left:100%;margin-left:6px;-webkit-transform:translateY(50%);-ms-transform:translateY(50%);transform:translateY(50%);}/*!sc*/
.fiSvBN.tooltipped-multiline::after{width:-webkit-max-content;width:-moz-max-content;width:max-content;max-width:250px;word-wrap:break-word;white-space:pre-line;border-collapse:separate;}/*!sc*/
.fiSvBN.tooltipped-multiline.tooltipped-s::after,.fiSvBN.tooltipped-multiline.tooltipped-n::after{right:auto;left:50%;-webkit-transform:translateX(-50%);-ms-transform:translateX(-50%);transform:translateX(-50%);}/*!sc*/
.fiSvBN.tooltipped-multiline.tooltipped-w::after,.fiSvBN.tooltipped-multiline.tooltipped-e::after{right:100%;}/*!sc*/
.fiSvBN.tooltipped-align-right-2::after{right:0;margin-right:0;}/*!sc*/
.fiSvBN.tooltipped-align-left-2::after{left:0;margin-left:0;}/*!sc*/
data-styled.g17[id="Tooltip__TooltipBase-sc-17tf59c-0"]{content:"fiSvBN,"}/*!sc*/
.eAtkQz{display:inline-block;overflow:hidden;text-overflow:ellipsis;vertical-align:top;white-space:nowrap;max-width:125px;max-width:100%;}/*!sc*/
.btDQYJ{display:inherit;overflow:hidden;text-overflow:ellipsis;vertical-align:initial;white-space:nowrap;max-width:125px;max-width:180px;display:block;}/*!sc*/
data-styled.g19[id="Truncate__StyledTruncate-sc-23o1d2-0"]{content:"eAtkQz,btDQYJ,"}/*!sc*/
.kQyrwv{--segmented-control-button-inner-padding:12px;--segmented-control-button-bg-inset:4px;--segmented-control-outer-radius:6px;background-color:transparent;border-color:transparent;border-radius:var(--segmented-control-outer-radius);border-width:0;color:currentColor;cursor:pointer;font-family:inherit;font-size:inherit;font-weight:600;padding:0;height:100%;width:100%;}/*!sc*/
.kQyrwv:focus:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.kQyrwv:focus:not(:disabled):not(:focus-visible){outline:solid 1px transparent;}/*!sc*/
.kQyrwv:focus-visible:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.kQyrwv .segmentedControl-content{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;background-color:var(--controlKnob-bgColor-rest,var(--color-segmented-control-button-bg,#0d1117));border-color:var(--controlKnob-borderColor-rest,var(--color-segmented-control-button-selected-border,#6e7681));border-style:solid;border-width:1px;border-radius:var(--segmented-control-outer-radius);display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;height:100%;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;padding-left:var(--segmented-control-button-inner-padding);padding-right:var(--segmented-control-button-inner-padding);}/*!sc*/
.kQyrwv svg{fill:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.kQyrwv:focus:focus-visible:not(:last-child):after{width:0;}/*!sc*/
.kQyrwv .segmentedControl-text:after{content:"Code";display:block;font-weight:600;height:0;overflow:hidden;pointer-events:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;visibility:hidden;}/*!sc*/
@media (pointer:coarse){.kQyrwv:before{content:"";position:absolute;left:0;right:0;-webkit-transform:translateY(-50%);-ms-transform:translateY(-50%);transform:translateY(-50%);top:50%;min-height:44px;}}/*!sc*/
.gKyOFO{--segmented-control-button-inner-padding:12px;--segmented-control-button-bg-inset:4px;--segmented-control-outer-radius:6px;background-color:transparent;border-color:transparent;border-radius:var(--segmented-control-outer-radius);border-width:0;color:currentColor;cursor:pointer;font-family:inherit;font-size:inherit;font-weight:400;padding:var(--segmented-control-button-bg-inset);height:100%;width:100%;}/*!sc*/
.gKyOFO:focus:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.gKyOFO:focus:not(:disabled):not(:focus-visible){outline:solid 1px transparent;}/*!sc*/
.gKyOFO:focus-visible:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.gKyOFO .segmentedControl-content{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;background-color:transparent;border-color:transparent;border-style:solid;border-width:1px;border-radius:calc(var(--segmented-control-outer-radius) - var(--segmented-control-button-bg-inset) / 2);display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;height:100%;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;padding-left:calc(var(--segmented-control-button-inner-padding) - var(--segmented-control-button-bg-inset));padding-right:calc(var(--segmented-control-button-inner-padding) - var(--segmented-control-button-bg-inset));}/*!sc*/
.gKyOFO svg{fill:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.gKyOFO:hover .segmentedControl-content{background-color:var(--controlTrack-bgColor-hover,var(--color-segmented-control-button-hover-bg,#30363d));}/*!sc*/
.gKyOFO:active .segmentedControl-content{background-color:var(--controlTrack-bgColor-active,var(--color-segmented-control-button-active-bg,#21262d));}/*!sc*/
.gKyOFO:focus:focus-visible:not(:last-child):after{width:0;}/*!sc*/
.gKyOFO .segmentedControl-text:after{content:"Blame";display:block;font-weight:600;height:0;overflow:hidden;pointer-events:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;visibility:hidden;}/*!sc*/
@media (pointer:coarse){.gKyOFO:before{content:"";position:absolute;left:0;right:0;-webkit-transform:translateY(-50%);-ms-transform:translateY(-50%);transform:translateY(-50%);top:50%;min-height:44px;}}/*!sc*/
data-styled.g105[id="SegmentedControlButton__SegmentedControlButtonStyled-sc-8lkgxl-0"]{content:"kQyrwv,gKyOFO,"}/*!sc*/
.eYPFoP{background-color:var(--controlTrack-bgColor-rest,var(--color-segmented-control-bg,rgba(110,118,129,0.1)));border-radius:6px;border:1px solid;border-color:var(--controlTrack-borderColor-rest,transparent);display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;font-size:14px;height:28px;margin:0;padding:0;}/*!sc*/
data-styled.g107[id="SegmentedControl__SegmentedControlList-sc-1rzig82-0"]{content:"eYPFoP,"}/*!sc*/
body[data-page-layout-dragging="true"]{cursor:col-resize;}/*!sc*/
body[data-page-layout-dragging="true"] *{-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;}/*!sc*/
data-styled.g108[id="sc-global-gbKrvU1"]{content:"sc-global-gbKrvU1,"}/*!sc*/
.cJWUiG{list-style:none;padding:0;margin:0;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item{outline:none;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item:focus-visible > div,.cJWUiG .PRIVATE_TreeView-item.focus-visible > div{box-shadow:inset 0 0 0 2px var(--fgColor-accent,var(--color-accent-fg,#2f81f7));}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item:focus-visible > div,.cJWUiG .PRIVATE_TreeView-item.focus-visible > div{outline:2px solid HighlightText;outline-offset:-2;}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item[data-has-leading-action]{--has-leading-action:1;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-container{--level:1;--toggle-width:1rem;--min-item-height:2rem;position:relative;display:grid;--leading-action-width:calc(var(--has-leading-action,0) * 1.5rem);--spacer-width:calc(calc(var(--level) - 1) * (var(--toggle-width) / 2));grid-template-columns:var(--spacer-width) var(--leading-action-width) var(--toggle-width) 1fr;grid-template-areas:'spacer leadingAction toggle content';width:100%;font-size:14px;color:var(--fgColor-default,var(--color-fg-default,#e6edf3));border-radius:6px;cursor:pointer;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-container:hover{background-color:var(--control-transparent-bgColor-hover,var(--color-action-list-item-default-hover-bg,rgba(177,186,196,0.12)));}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item-container:hover{outline:2px solid transparent;outline-offset:-2px;}}/*!sc*/
@media (pointer:coarse){.cJWUiG .PRIVATE_TreeView-item-container{--toggle-width:1.5rem;--min-item-height:2.75rem;}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-container:has(.PRIVATE_TreeView-item-skeleton):hover{background-color:transparent;cursor:default;}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item-container:has(.PRIVATE_TreeView-item-skeleton):hover{outline:none;}}/*!sc*/
.cJWUiG[data-omit-spacer='true'] .PRIVATE_TreeView-item-container{grid-template-columns:0 0 0 1fr;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item[aria-current='true'] > .PRIVATE_TreeView-item-container{background-color:var(--control-transparent-bgColor-selected,var(--color-action-list-item-default-selected-bg,rgba(177,186,196,0.08)));}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item[aria-current='true'] > .PRIVATE_TreeView-item-container::after{content:'';position:absolute;top:calc(50% - 0.75rem);left:-8px;width:0.25rem;height:1.5rem;background-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));border-radius:6px;}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item[aria-current='true'] > .PRIVATE_TreeView-item-container::after{background-color:HighlightText;}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-toggle{grid-area:toggle;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;-webkit-align-items:flex-start;-webkit-box-align:flex-start;-ms-flex-align:flex-start;align-items:flex-start;padding-top:calc(var(--min-item-height) / 2 - 12px / 2);height:100%;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-toggle--hover:hover{background-color:var(--control-transparent-bgColor-hover,var(--color-tree-view-item-chevron-hover-bg,rgba(177,186,196,0.12)));}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-toggle--end{border-top-left-radius:6px;border-bottom-left-radius:6px;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-content{grid-area:content;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;height:100%;padding:0 8px;gap:8px;line-height:var(--custom-line-height,var(--text-body-lineHeight-medium,1.4285));padding-top:calc((var(--min-item-height) - var(--custom-line-height,1.3rem)) / 2);padding-bottom:calc((var(--min-item-height) - var(--custom-line-height,1.3rem)) / 2);}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-content-text{-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;width:0;}/*!sc*/
.cJWUiG[data-truncate-text='true'] .PRIVATE_TreeView-item-content-text{overflow:hidden;white-space:nowrap;text-overflow:ellipsis;}/*!sc*/
.cJWUiG[data-truncate-text='false'] .PRIVATE_TreeView-item-content-text{word-break:break-word;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-visual{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));height:var(--custom-line-height,1.3rem);}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-leading-action{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));grid-area:leadingAction;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-leading-action > button{-webkit-flex-shrink:1;-ms-flex-negative:1;flex-shrink:1;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-level-line{width:100%;height:100%;border-right:1px solid;border-color:var(--borderColor-muted,var(--color-border-subtle,rgba(240,246,252,0.1)));}/*!sc*/
@media (hover:hover){.cJWUiG .PRIVATE_TreeView-item-level-line{border-color:transparent;}.cJWUiG:hover .PRIVATE_TreeView-item-level-line,.cJWUiG:focus-within .PRIVATE_TreeView-item-level-line{border-color:var(--borderColor-muted,var(--color-border-subtle,rgba(240,246,252,0.1)));}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-directory-icon{display:grid;color:var(--treeViewItem-leadingVisual-iconColor-rest,var(--color-tree-view-item-directory-fill,#848d97));}/*!sc*/
.cJWUiG .PRIVATE_VisuallyHidden{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;-webkit-clip:rect(0,0,0,0);clip:rect(0,0,0,0);white-space:nowrap;border-width:0;}/*!sc*/
data-styled.g114[id="TreeView__UlBox-sc-4ex6b6-0"]{content:"cJWUiG,"}/*!sc*/
extraire_fichiers.py

Python script from the Solr_utilisation repository (main branch). Other files in the repository: RI_PySolr (1).pdf, commande_curl_solr.pdf, main.py, notes.txt, precision_recall.py, requete_resultat_solr.py, requetes.py.
class="prc-Button-Label-pTQ3x">Blame</span></span></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button data-component="IconButton" type="button" aria-label="More file actions" title="More file actions" data-testid="more-file-actions-button-nav-menu-narrow" aria-haspopup="true" aria-expanded="false" tabindex="0" class="Box-sc-g0xbh4-0 kVRliy prc-Button-ButtonBase-c50BI js-blob-dropdown-click prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R157d9lab:-loading-announcement" id=":R157d9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-kebab-horizontal" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M8 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3ZM1.5 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Zm13 0a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path></svg></button> </div></div></div></div></div></div></div></div><div class="Box-sc-g0xbh4-0 dJxjrT react-code-view-bottom-padding"> <div class="Box-sc-g0xbh4-0 eFxKDQ"></div> <!-- --> <!-- --> </div><div class="Box-sc-g0xbh4-0 dJxjrT"> <!-- --> <!-- --> <button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><div class="d-flex flex-column border rounded-2 mb-3 pl-1"><div class="Box-sc-g0xbh4-0 dzCJzi"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading">Latest commit</h2><div style="width:120px" class="Skeleton Skeleton--text" data-testid="loading"> </div><div class="d-flex flex-shrink-0 gap-2"><div data-testid="latest-commit-details" class="d-none d-sm-flex flex-items-center"></div><div class="d-flex gap-2"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading">History</h2><a href="/Alimiji/Solr_utilisation/commits/main/extraire_fichiers.py" class="prc-Button-ButtonBase-c50BI d-none d-lg-flex LinkButton-module__code-view-link-button--xvCGA flex-items-center fgColor-default" data-loading="false" data-size="small" data-variant="invisible" aria-describedby=":R5dlal9lab:-loading-announcement"><span data-component="buttonContent" data-align="center" class="prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-history" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m.427 1.927 1.215 1.215a8.002 8.002 0 1 1-1.6 5.685.75.75 0 1 1 1.493-.154 6.5 6.5 0 1 0 1.18-4.458l1.358 1.358A.25.25 0 0 1 3.896 6H.25A.25.25 0 0 1 0 5.75V2.104a.25.25 0 0 1 .427-.177ZM7.75 4a.75.75 0 0 1 .75.75v2.992l2.028.812a.75.75 0 0 1-.557 1.392l-2.5-1A.751.751 0 0 1 7 8.25v-3.5A.75.75 0 0 1 7.75 4Z"></path></svg></span><span data-component="text" class="prc-Button-Label-pTQ3x"><span class="fgColor-default">History</span></span></span></a><div class="d-sm-none"></div><div class="d-flex d-lg-none"><span role="tooltip" aria-label="History" id="history-icon-button-tooltip" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-n"><a href="/Alimiji/Solr_utilisation/commits/main/extraire_fichiers.py" class="prc-Button-ButtonBase-c50BI 
LinkButton-module__code-view-link-button--xvCGA flex-items-center fgColor-default" data-loading="false" data-size="small" data-variant="invisible" aria-describedby=":Rpdlal9lab:-loading-announcement history-icon-button-tooltip"><span data-component="buttonContent" data-align="center" class="prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-history" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m.427 1.927 1.215 1.215a8.002 8.002 0 1 1-1.6 5.685.75.75 0 1 1 1.493-.154 6.5 6.5 0 1 0 1.18-4.458l1.358 1.358A.25.25 0 0 1 3.896 6H.25A.25.25 0 0 1 0 5.75V2.104a.25.25 0 0 1 .427-.177ZM7.75 4a.75.75 0 0 1 .75.75v2.992l2.028.812a.75.75 0 0 1-.557 1.392l-2.5-1A.751.751 0 0 1 7 8.25v-3.5A.75.75 0 0 1 7.75 4Z"></path></svg></span></span></a></span></div></div></div></div></div><div class="Box-sc-g0xbh4-0 ldRxiI"><div class="Box-sc-g0xbh4-0 efRoCL container"><div class="Box-sc-g0xbh4-0 gNAmSV react-code-size-details-banner"><div class="Box-sc-g0xbh4-0 jNEwzY react-code-size-details-banner"><div class="Box-sc-g0xbh4-0 ifyOQK text-mono"><div title="6.04 KB" data-testid="blob-size" class="Truncate__StyledTruncate-sc-23o1d2-0 eAtkQz"><span>171 lines (131 loc) · 6.04 KB</span></div></div></div></div><div class="Box-sc-g0xbh4-0 jdLMhu react-blob-view-header-sticky" id="repos-sticky-header"><div class="Box-sc-g0xbh4-0 tOISc"><div class="react-blob-sticky-header"><div class="Box-sc-g0xbh4-0 hqwSEx"><div class="Box-sc-g0xbh4-0 lzKZY"><div class="Box-sc-g0xbh4-0 fHind"><nav data-testid="breadcrumbs" aria-labelledby="sticky-breadcrumb-heading" id="sticky-breadcrumb" class="Box-sc-g0xbh4-0 fzFXnm"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading" id="sticky-breadcrumb-heading">Breadcrumbs</h2><ol class="Box-sc-g0xbh4-0 iMnkmv"><li class="Box-sc-g0xbh4-0 ghzDag"><a class="Box-sc-g0xbh4-0 kHuKdh prc-Link-Link-85e08" sx="[object Object]" data-testid="breadcrumbs-repo-link" href="/Alimiji/Solr_utilisation/tree/main">Solr_utilisation</a></li></ol></nav><div data-testid="breadcrumbs-filename" class="Box-sc-g0xbh4-0 ghzDag"><span class="Text__StyledText-sc-17v1xeu-0 wcuBT" aria-hidden="true">/</span><h1 class="Box-sc-g0xbh4-0 dnZoUW prc-Heading-Heading-6CmGO" tabindex="-1" id="sticky-file-name-id">extraire_fichiers.py</h1></div></div><button style="--button-color:fg.default" type="button" class="Box-sc-g0xbh4-0 dpNnZU prc-Button-ButtonBase-c50BI" data-loading="false" data-size="small" data-variant="invisible" aria-describedby=":Riptal9lab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-arrow-up" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M3.47 7.78a.75.75 0 0 1 0-1.06l4.25-4.25a.75.75 0 0 1 1.06 0l4.25 4.25a.751.751 0 0 1-.018 1.042.751.751 0 0 1-1.042.018L9 4.81v7.44a.75.75 0 0 1-1.5 0V4.81L4.53 7.78a.75.75 0 0 1-1.06 0Z"></path></svg></span><span data-component="text" 
class="prc-Button-Label-pTQ3x">Top</span></span></button></div></div></div><div class="Box-sc-g0xbh4-0 gpHFJV"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading">File metadata and controls</h2><div class="Box-sc-g0xbh4-0 iNMjfP"><ul aria-label="File view" class="SegmentedControl__SegmentedControlList-sc-1rzig82-0 eYPFoP" data-size="small"><li class="Box-sc-g0xbh4-0 fefCSX" data-selected="true"><button aria-current="true" class="SegmentedControlButton__SegmentedControlButtonStyled-sc-8lkgxl-0 kQyrwv" type="button"><span class="segmentedControl-content"><div class="Box-sc-g0xbh4-0 segmentedControl-text" data-text="Code">Code</div></span></button></li><li class="Box-sc-g0xbh4-0 sulSy"><button aria-current="false" class="SegmentedControlButton__SegmentedControlButtonStyled-sc-8lkgxl-0 gKyOFO" type="button"><span class="segmentedControl-content"><div class="Box-sc-g0xbh4-0 segmentedControl-text" data-text="Blame">Blame</div></span></button></li></ul><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><div class="Box-sc-g0xbh4-0 jNEwzY react-code-size-details-in-header"><div class="Box-sc-g0xbh4-0 ifyOQK text-mono"><div title="6.04 KB" data-testid="blob-size" class="Truncate__StyledTruncate-sc-23o1d2-0 eAtkQz"><span>171 lines (131 loc) · 6.04 KB</span></div></div></div></div><div class="Box-sc-g0xbh4-0 kcLCKF"><div class="Box-sc-g0xbh4-0 pr-0 prc-ButtonGroup-ButtonGroup-vcMeG"><div><button data-component="IconButton" type="button" data-testid="copilot-ask-menu" class="prc-Button-ButtonBase-c50BI AskCopilotButton-module__square--o8kDO prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby="blob-view-header-copilot-icon-loading-announcement" aria-labelledby=":Rbsptal9lab:" id="blob-view-header-copilot-icon"><svg aria-hidden="true" focusable="false" class="octicon octicon-copilot" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M7.998 15.035c-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.201-.508-.254-1.084-.254-1.656 0-.87.128-1.769.693-2.484.579-.733 1.494-1.124 2.724-1.261 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095v1.872c0 .766-3.351 3.795-8.002 3.795Zm0-1.485c2.28 0 4.584-1.11 5.002-1.433V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-1.146 0-2.059-.327-2.71-.991A3.222 3.222 0 0 1 8 6.303a3.24 3.24 0 0 1-.544.743c-.65.664-1.563.991-2.71.991-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433ZM6.762 2.83c-.193-.206-.637-.413-1.682-.297-1.019.113-1.479.404-1.713.7-.247.312-.369.789-.369 1.554 0 .793.129 1.171.308 1.371.162.181.519.379 1.442.379.853 0 1.339-.235 1.638-.54.315-.322.527-.827.617-1.553.117-.935-.037-1.395-.241-1.614Zm4.155-.297c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Z"></path><path 
d="M6.25 9.037a.75.75 0 0 1 .75.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 .75-.75Zm4.25.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 1.5 0Z"></path></svg></button><span class="Tooltip__StyledTooltip-sc-e45c7z-0 fLAhLl" data-direction="s" aria-hidden="true" id=":Rbsptal9lab:">Ask Copilot about this file</span></div><div></div></div><div class="Box-sc-g0xbh4-0 kVWtTz react-blob-header-edit-and-raw-actions"><div class="Box-sc-g0xbh4-0 prc-ButtonGroup-ButtonGroup-vcMeG"><div><a href="https://github.com/Alimiji/Solr_utilisation/raw/refs/heads/main/extraire_fichiers.py" data-testid="raw-button" class="Box-sc-g0xbh4-0 gWqxTd prc-Button-ButtonBase-c50BI" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":R5csptal9lab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="text" class="prc-Button-Label-pTQ3x">Raw</span></span></a></div><div><button data-component="IconButton" type="button" aria-label="Copy raw content" data-testid="copy-raw-button" class="prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Rpcsptal9lab:-loading-announcement"><svg aria-hidden="true" focusable="false" class="octicon octicon-copy" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path></svg></button></div><div><span role="tooltip" aria-label="Download raw file" id=":Rdcsptal9lab:" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-n"><button data-component="IconButton" type="button" aria-label="Download raw content" data-testid="download-raw-button" class="Box-sc-g0xbh4-0 ivobqY prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Rtcsptal9lab:-loading-announcement"><svg aria-hidden="true" focusable="false" class="octicon octicon-download" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2.75 14A1.75 1.75 0 0 1 1 12.25v-2.5a.75.75 0 0 1 1.5 0v2.5c0 .138.112.25.25.25h10.5a.25.25 0 0 0 .25-.25v-2.5a.75.75 0 0 1 1.5 0v2.5A1.75 1.75 0 0 1 13.25 14Z"></path><path d="M7.25 7.689V2a.75.75 0 0 1 1.5 0v5.689l1.97-1.969a.749.749 0 1 1 1.06 1.06l-3.25 3.25a.749.749 0 0 1-1.06 0L4.22 6.78a.749.749 0 1 1 1.06-1.06l1.97 1.969Z"></path></svg></button></span></div></div><button hidden="" data-testid="raw-button-shortcut" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden="" data-testid="copy-raw-button-shortcut" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden="" data-testid="download-raw-button-shortcut" data-hotkey-scope="read-only-cursor-text-area"></button><a class="js-github-dev-shortcut d-none prc-Link-Link-85e08" href="https://github.dev/"></a><button 
hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><a class="js-github-dev-new-tab-shortcut d-none prc-Link-Link-85e08" href="https://github.dev/" target="_blank"></a><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><div class="Box-sc-g0xbh4-0 prc-ButtonGroup-ButtonGroup-vcMeG"><div><span role="tooltip" aria-label="Edit the file in your fork of this project" id=":R6ksptal9lab:" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-nw"><a sx="[object Object]" data-component="IconButton" type="button" aria-label="Edit file" data-testid="edit-button" class="Box-sc-g0xbh4-0 kilKoS prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Rmksptal9lab:-loading-announcement" href="/Alimiji/Solr_utilisation/edit/main/extraire_fichiers.py"><svg aria-hidden="true" focusable="false" class="octicon octicon-pencil" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M11.013 1.427a1.75 1.75 0 0 1 2.474 0l1.086 1.086a1.75 1.75 0 0 1 0 2.474l-8.61 8.61c-.21.21-.47.364-.756.445l-3.251.93a.75.75 0 0 1-.927-.928l.929-3.25c.081-.286.235-.547.445-.758l8.61-8.61Zm.176 4.823L9.75 4.81l-6.286 6.287a.253.253 0 0 0-.064.108l-.558 1.953 1.953-.558a.253.253 0 0 0 .108-.064Zm1.238-3.763a.25.25 0 0 0-.354 0L10.811 3.75l1.439 1.44 1.263-1.263a.25.25 0 0 0 0-.354Z"></path></svg></a></span></div><div><button data-component="IconButton" type="button" aria-label="More edit options" data-testid="more-edit-button" aria-haspopup="true" aria-expanded="false" tabindex="0" class="prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Raksptal9lab:-loading-announcement" id=":Raksptal9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-triangle-down" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path></svg></button></div></div><button hidden="" data-testid="" data-hotkey="e,Shift+E" data-hotkey-scope="read-only-cursor-text-area"></button></div><span role="tooltip" aria-label="Close symbols panel" id=":R5sptal9lab:" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-nw"><button data-component="IconButton" type="button" aria-label="Symbols" aria-pressed="true" aria-expanded="true" aria-controls="symbols-pane" data-testid="symbols-button" class="Box-sc-g0xbh4-0 hySUEo prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="invisible" aria-describedby="symbols-button-loading-announcement" id="symbols-button"><svg aria-hidden="true" focusable="false" class="octicon octicon-code-square" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25Zm7.47 3.97a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 
2a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L10.69 8 9.22 6.53a.75.75 0 0 1 0-1.06ZM6.78 6.53 5.31 8l1.47 1.47a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215l-2-2a.75.75 0 0 1 0-1.06l2-2a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path></svg></button></span><div class="react-blob-header-edit-and-raw-actions-combined"><button data-component="IconButton" type="button" aria-label="Edit and raw actions" title="More file actions" data-testid="more-file-actions-button" aria-haspopup="true" aria-expanded="false" tabindex="0" class="Box-sc-g0xbh4-0 itGLhU prc-Button-ButtonBase-c50BI js-blob-dropdown-click prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="invisible" aria-describedby=":Rnsptal9lab:-loading-announcement" id=":Rnsptal9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-kebab-horizontal" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M8 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3ZM1.5 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Zm13 0a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path></svg></button></div></div></div></div><div></div></div><div class="Box-sc-g0xbh4-0 hycJXc"><section aria-labelledby="file-name-id-wide file-name-id-mobile" class="Box-sc-g0xbh4-0 dceWRL"><div class="Box-sc-g0xbh4-0 dGXHv"><div id="highlighted-line-menu-positioner" class="position-relative"><div id="copilot-button-positioner" class="Box-sc-g0xbh4-0 bpDFns"><div class="Box-sc-g0xbh4-0 iJOeCH"><div class="Box-sc-g0xbh4-0 jewUnv react-code-file-contents" role="presentation" aria-hidden="true" data-tab-size="8" data-paste-markdown-skip="true" data-hpc="true"><div class="react-line-numbers" style="pointer-events:auto"><div data-line-number="1" class="react-line-number react-code-text" style="padding-right:16px">1</div><div data-line-number="2" class="react-line-number react-code-text" style="padding-right:16px">2</div><div data-line-number="3" class="react-line-number react-code-text" style="padding-right:16px">3</div><div data-line-number="4" class="react-line-number react-code-text" style="padding-right:16px">4</div><div data-line-number="5" class="react-line-number react-code-text" style="padding-right:16px">5</div><div data-line-number="6" class="react-line-number react-code-text" style="padding-right:16px">6</div><div data-line-number="7" class="react-line-number react-code-text" style="padding-right:16px">7</div><div data-line-number="8" class="react-line-number react-code-text" style="padding-right:16px">8</div><div data-line-number="9" class="react-line-number react-code-text" style="padding-right:16px">9</div><div data-line-number="10" class="react-line-number react-code-text" style="padding-right:16px">10</div><div data-line-number="11" class="react-line-number react-code-text" style="padding-right:16px">11</div><div data-line-number="12" class="react-line-number react-code-text" style="padding-right:16px">12</div><div data-line-number="13" class="react-line-number react-code-text" style="padding-right:16px">13</div><div data-line-number="14" class="react-line-number react-code-text" style="padding-right:16px">14</div><div data-line-number="15" class="react-line-number react-code-text" style="padding-right:16px">15</div><div data-line-number="16" class="react-line-number react-code-text" style="padding-right:16px">16</div><div data-line-number="17" class="react-line-number react-code-text" 
style="padding-right:16px">17</div><div data-line-number="18" class="react-line-number react-code-text" style="padding-right:16px">18</div><div data-line-number="19" class="react-line-number react-code-text" style="padding-right:16px">19</div><div data-line-number="20" class="react-line-number react-code-text" style="padding-right:16px">20</div><div data-line-number="21" class="react-line-number react-code-text" style="padding-right:16px">21</div><div data-line-number="22" class="react-line-number react-code-text" style="padding-right:16px">22</div><div data-line-number="23" class="react-line-number react-code-text" style="padding-right:16px">23</div><div data-line-number="24" class="react-line-number react-code-text" style="padding-right:16px">24</div><div data-line-number="25" class="react-line-number react-code-text" style="padding-right:16px">25</div><div data-line-number="26" class="react-line-number react-code-text" style="padding-right:16px">26</div><div data-line-number="27" class="react-line-number react-code-text" style="padding-right:16px">27</div><div data-line-number="28" class="react-line-number react-code-text" style="padding-right:16px">28</div><div data-line-number="29" class="react-line-number react-code-text" style="padding-right:16px">29</div><div data-line-number="30" class="react-line-number react-code-text" style="padding-right:16px">30</div><div data-line-number="31" class="react-line-number react-code-text" style="padding-right:16px">31</div><div data-line-number="32" class="react-line-number react-code-text" style="padding-right:16px">32</div><div data-line-number="33" class="react-line-number react-code-text" style="padding-right:16px">33</div><div data-line-number="34" class="react-line-number react-code-text" style="padding-right:16px">34</div><div data-line-number="35" class="react-line-number react-code-text" style="padding-right:16px">35</div><div data-line-number="36" class="react-line-number react-code-text" style="padding-right:16px">36</div><div data-line-number="37" class="react-line-number react-code-text" style="padding-right:16px">37</div><div data-line-number="38" class="react-line-number react-code-text" style="padding-right:16px">38</div><div data-line-number="39" class="react-line-number react-code-text" style="padding-right:16px">39</div><div data-line-number="40" class="react-line-number react-code-text" style="padding-right:16px">40</div><div data-line-number="41" class="react-line-number react-code-text" style="padding-right:16px">41</div><div data-line-number="42" class="react-line-number react-code-text" style="padding-right:16px">42</div><div data-line-number="43" class="react-line-number react-code-text" style="padding-right:16px">43<span class="Box-sc-g0xbh4-0 cJGaMs"><div aria-label="Collapse code section" role="button" class="Box-sc-g0xbh4-0 iGLarr"><svg aria-hidden="true" focusable="false" class="Octicon-sc-9kayk9-0" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M12.78 5.22a.749.749 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.06 0L3.22 6.28a.749.749 0 1 1 1.06-1.06L8 8.939l3.72-3.719a.749.749 0 0 1 1.06 0Z"></path></svg></div></span></div><div data-line-number="44" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">44</div><div data-line-number="45" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">45</div><div data-line-number="46" 
class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">46</div><div data-line-number="47" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">47</div><div data-line-number="48" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">48</div><div data-line-number="49" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">49</div><div data-line-number="50" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">50</div><div data-line-number="51" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">51</div><div data-line-number="52" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">52</div><div data-line-number="53" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">53</div><div data-line-number="54" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">54</div><div data-line-number="55" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">55</div><div data-line-number="56" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">56</div><div data-line-number="57" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">57</div><div data-line-number="58" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">58</div><div data-line-number="59" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">59</div><div data-line-number="60" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">60</div><div data-line-number="61" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">61</div><div data-line-number="62" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">62</div><div data-line-number="63" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">63</div><div data-line-number="64" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">64</div><div data-line-number="65" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">65</div><div data-line-number="66" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">66</div><div data-line-number="67" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">67</div><div data-line-number="68" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">68</div><div data-line-number="69" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">69</div><div data-line-number="70" class="react-line-number react-code-text" style="padding-right:16px">70</div><div data-line-number="71" class="react-line-number react-code-text" style="padding-right:16px">71</div><div data-line-number="72" class="react-line-number react-code-text" style="padding-right:16px">72</div><div data-line-number="73" class="react-line-number react-code-text" style="padding-right:16px">73</div><div data-line-number="74" class="react-line-number react-code-text" style="padding-right:16px">74</div><div data-line-number="75" class="react-line-number react-code-text" style="padding-right:16px">75</div><div data-line-number="76" 
class="react-line-number react-code-text" style="padding-right:16px">76</div><div data-line-number="77" class="react-line-number react-code-text" style="padding-right:16px">77</div><div data-line-number="78" class="react-line-number react-code-text" style="padding-right:16px">78</div><div data-line-number="79" class="react-line-number react-code-text" style="padding-right:16px">79</div><div data-line-number="80" class="react-line-number react-code-text" style="padding-right:16px">80</div><div data-line-number="81" class="react-line-number react-code-text" style="padding-right:16px">81</div><div data-line-number="82" class="react-line-number react-code-text" style="padding-right:16px">82</div><div data-line-number="83" class="react-line-number react-code-text" style="padding-right:16px">83</div><div data-line-number="84" class="react-line-number react-code-text" style="padding-right:16px">84</div><div data-line-number="85" class="react-line-number react-code-text" style="padding-right:16px">85</div><div data-line-number="86" class="react-line-number react-code-text" style="padding-right:16px">86</div><div data-line-number="87" class="react-line-number react-code-text" style="padding-right:16px">87</div><div data-line-number="88" class="react-line-number react-code-text" style="padding-right:16px">88</div><div data-line-number="89" class="react-line-number react-code-text" style="padding-right:16px">89</div><div data-line-number="90" class="react-line-number react-code-text" style="padding-right:16px">90</div><div data-line-number="91" class="react-line-number react-code-text" style="padding-right:16px">91</div><div data-line-number="92" class="react-line-number react-code-text" style="padding-right:16px">92</div><div data-line-number="93" class="react-line-number react-code-text" style="padding-right:16px">93</div><div data-line-number="94" class="react-line-number react-code-text" style="padding-right:16px">94</div><div data-line-number="95" class="react-line-number react-code-text" style="padding-right:16px">95</div><div data-line-number="96" class="react-line-number react-code-text" style="padding-right:16px">96</div><div data-line-number="97" class="react-line-number react-code-text" style="padding-right:16px">97</div><div data-line-number="98" class="react-line-number react-code-text" style="padding-right:16px">98</div><div data-line-number="99" class="react-line-number react-code-text" style="padding-right:16px">99</div><div data-line-number="100" class="react-line-number react-code-text" style="padding-right:16px">100</div><div data-line-number="101" class="react-line-number react-code-text" style="padding-right:16px">101</div><div data-line-number="102" class="react-line-number react-code-text" style="padding-right:16px">102</div><div data-line-number="103" class="react-line-number react-code-text" style="padding-right:16px">103</div><div data-line-number="104" class="react-line-number react-code-text" style="padding-right:16px">104</div><div data-line-number="105" class="react-line-number react-code-text" style="padding-right:16px">105</div><div data-line-number="106" class="react-line-number react-code-text" style="padding-right:16px">106</div><div data-line-number="107" class="react-line-number react-code-text" style="padding-right:16px">107</div><div data-line-number="108" class="react-line-number react-code-text" style="padding-right:16px">108</div><div data-line-number="109" class="react-line-number react-code-text" style="padding-right:16px">109</div><div 
data-line-number="110" class="react-line-number react-code-text" style="padding-right:16px">110</div><div data-line-number="111" class="react-line-number react-code-text" style="padding-right:16px">111</div><div data-line-number="112" class="react-line-number react-code-text" style="padding-right:16px">112</div><div data-line-number="113" class="react-line-number react-code-text" style="padding-right:16px">113</div><div data-line-number="114" class="react-line-number react-code-text" style="padding-right:16px">114</div><div data-line-number="115" class="react-line-number react-code-text" style="padding-right:16px">115</div><div data-line-number="116" class="react-line-number react-code-text" style="padding-right:16px">116</div><div data-line-number="117" class="react-line-number react-code-text" style="padding-right:16px">117</div><div data-line-number="118" class="react-line-number react-code-text" style="padding-right:16px">118</div><div data-line-number="119" class="react-line-number react-code-text" style="padding-right:16px">119</div><div data-line-number="120" class="react-line-number react-code-text" style="padding-right:16px">120</div><div data-line-number="121" class="react-line-number react-code-text" style="padding-right:16px">121</div><div data-line-number="122" class="react-line-number react-code-text" style="padding-right:16px">122</div><div data-line-number="123" class="react-line-number react-code-text" style="padding-right:16px">123</div><div data-line-number="124" class="react-line-number react-code-text" style="padding-right:16px">124</div><div data-line-number="125" class="react-line-number react-code-text" style="padding-right:16px">125</div><div data-line-number="126" class="react-line-number react-code-text" style="padding-right:16px">126</div><div data-line-number="127" class="react-line-number react-code-text" style="padding-right:16px">127</div><div data-line-number="128" class="react-line-number react-code-text" style="padding-right:16px">128</div><div data-line-number="129" class="react-line-number react-code-text" style="padding-right:16px">129</div><div data-line-number="130" class="react-line-number react-code-text" style="padding-right:16px">130</div><div data-line-number="131" class="react-line-number react-code-text" style="padding-right:16px">131</div><div data-line-number="132" class="react-line-number react-code-text" style="padding-right:16px">132</div><div data-line-number="133" class="react-line-number react-code-text" style="padding-right:16px">133</div><div data-line-number="134" class="react-line-number react-code-text" style="padding-right:16px">134</div><div data-line-number="135" class="react-line-number react-code-text" style="padding-right:16px">135</div><div data-line-number="136" class="react-line-number react-code-text" style="padding-right:16px">136</div><div data-line-number="137" class="react-line-number react-code-text" style="padding-right:16px">137</div><div data-line-number="138" class="react-line-number react-code-text" style="padding-right:16px">138</div><div data-line-number="139" class="react-line-number react-code-text" style="padding-right:16px">139</div><div data-line-number="140" class="react-line-number react-code-text" style="padding-right:16px">140</div><div data-line-number="141" class="react-line-number react-code-text" style="padding-right:16px">141</div><div data-line-number="142" class="react-line-number react-code-text" style="padding-right:16px">142</div><div data-line-number="143" class="react-line-number 
react-code-text" style="padding-right:16px">143</div><div data-line-number="144" class="react-line-number react-code-text" style="padding-right:16px">144</div><div data-line-number="145" class="react-line-number react-code-text" style="padding-right:16px">145</div><div data-line-number="146" class="react-line-number react-code-text" style="padding-right:16px">146</div><div data-line-number="147" class="react-line-number react-code-text" style="padding-right:16px">147</div><div data-line-number="148" class="react-line-number react-code-text" style="padding-right:16px">148</div><div data-line-number="149" class="react-line-number react-code-text" style="padding-right:16px">149</div><div data-line-number="150" class="react-line-number react-code-text" style="padding-right:16px">150</div><div data-line-number="151" class="react-line-number react-code-text" style="padding-right:16px">151</div><div data-line-number="152" class="react-line-number react-code-text" style="padding-right:16px">152</div><div data-line-number="153" class="react-line-number react-code-text" style="padding-right:16px">153</div><div data-line-number="154" class="react-line-number react-code-text" style="padding-right:16px">154</div><div data-line-number="155" class="react-line-number react-code-text" style="padding-right:16px">155</div><div data-line-number="156" class="react-line-number react-code-text" style="padding-right:16px">156</div><div data-line-number="157" class="react-line-number react-code-text" style="padding-right:16px">157</div><div data-line-number="158" class="react-line-number react-code-text" style="padding-right:16px">158</div><div data-line-number="159" class="react-line-number react-code-text" style="padding-right:16px">159</div><div data-line-number="160" class="react-line-number react-code-text" style="padding-right:16px">160</div><div data-line-number="161" class="react-line-number react-code-text" style="padding-right:16px">161</div><div data-line-number="162" class="react-line-number react-code-text" style="padding-right:16px">162</div><div data-line-number="163" class="react-line-number react-code-text" style="padding-right:16px">163</div><div data-line-number="164" class="react-line-number react-code-text" style="padding-right:16px">164</div><div data-line-number="165" class="react-line-number react-code-text" style="padding-right:16px">165</div><div data-line-number="166" class="react-line-number react-code-text" style="padding-right:16px">166</div><div data-line-number="167" class="react-line-number react-code-text" style="padding-right:16px">167</div><div data-line-number="168" class="react-line-number react-code-text" style="padding-right:16px">168</div><div data-line-number="169" class="react-line-number react-code-text" style="padding-right:16px">169</div></div><div class="react-code-lines"><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC1" class="react-file-line html-div" data-testid="code-cell" data-line-number="1" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">json</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC2" class="react-file-line html-div" data-testid="code-cell" data-line-number="2" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC3" class="react-file-line html-div" 
data-testid="code-cell" data-line-number="3" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">gzip</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC4" class="react-file-line html-div" data-testid="code-cell" data-line-number="4" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">shutil</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC5" class="react-file-line html-div" data-testid="code-cell" data-line-number="5" style="position:relative"><span class="pl-k">from</span> <span class="pl-s1">requetes</span> <span class="pl-k">import</span> <span class="pl-s1">extraire_requetes_longues</span>, <span class="pl-s1">extraire_requetes_courtes</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC6" class="react-file-line html-div" data-testid="code-cell" data-line-number="6" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC7" class="react-file-line html-div" data-testid="code-cell" data-line-number="7" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC8" class="react-file-line html-div" data-testid="code-cell" data-line-number="8" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">gzip</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC9" class="react-file-line html-div" data-testid="code-cell" data-line-number="9" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">shutil</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC10" class="react-file-line html-div" data-testid="code-cell" data-line-number="10" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC11" class="react-file-line html-div" data-testid="code-cell" data-line-number="11" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC12" class="react-file-line html-div" data-testid="code-cell" data-line-number="12" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">gzip</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC13" class="react-file-line html-div" data-testid="code-cell" data-line-number="13" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">shutil</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC14" class="react-file-line html-div" data-testid="code-cell" data-line-number="14" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC15" class="react-file-line html-div" data-testid="code-cell" data-line-number="15" style="position:relative"><span class="pl-c"># Chemins des dossiers</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC16" class="react-file-line html-div" data-testid="code-cell" data-line-number="16" style="position:relative"><span class="pl-s1">input_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC17" class="react-file-line html-div" data-testid="code-cell" data-line-number="17" style="position:relative"><span class="pl-s1">output_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP_ok&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC18" class="react-file-line html-div" data-testid="code-cell" data-line-number="18" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC19" class="react-file-line html-div" data-testid="code-cell" data-line-number="19" style="position:relative"><span class="pl-c"># Vérifie que le dossier de sortie existe, sinon le crée</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC20" class="react-file-line html-div" data-testid="code-cell" data-line-number="20" style="position:relative"><span class="pl-s1">os</span>.<span class="pl-c1">makedirs</span>(<span class="pl-s1">output_folder</span>, <span class="pl-s1">exist_ok</span><span class="pl-c1">=</span><span class="pl-c1">True</span>)</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC21" class="react-file-line html-div" data-testid="code-cell" data-line-number="21" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC22" class="react-file-line html-div" data-testid="code-cell" data-line-number="22" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC23" class="react-file-line html-div" data-testid="code-cell" data-line-number="23" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC24" class="react-file-line html-div" data-testid="code-cell" data-line-number="24" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC25" class="react-file-line html-div" data-testid="code-cell" data-line-number="25" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">chardet</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC26" class="react-file-line html-div" data-testid="code-cell" data-line-number="26" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC27" class="react-file-line html-div" data-testid="code-cell" data-line-number="27" style="position:relative"><span class="pl-c"># Chemins des dossiers</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC28" class="react-file-line html-div" data-testid="code-cell" data-line-number="28" style="position:relative"><span class="pl-s1">input_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP__ok&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC29" class="react-file-line html-div" data-testid="code-cell" data-line-number="29" style="position:relative"><span class="pl-s1">output_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP_fixed&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC30" class="react-file-line html-div" data-testid="code-cell" data-line-number="30" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC31" class="react-file-line html-div" data-testid="code-cell" data-line-number="31" style="position:relative"><span class="pl-c"># Crée le dossier de sortie s&#039;il n&#039;existe pas</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC32" class="react-file-line html-div" data-testid="code-cell" data-line-number="32" style="position:relative"><span class="pl-s1">os</span>.<span class="pl-c1">makedirs</span>(<span class="pl-s1">output_folder</span>, <span class="pl-s1">exist_ok</span><span class="pl-c1">=</span><span class="pl-c1">True</span>)</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC33" class="react-file-line html-div" data-testid="code-cell" data-line-number="33" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC34" class="react-file-line html-div" data-testid="code-cell" data-line-number="34" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC35" class="react-file-line html-div" data-testid="code-cell" data-line-number="35" style="position:relative"><span class="pl-c"># Fonction pour détecter l&#039;encodage</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC36" class="react-file-line html-div" data-testid="code-cell" data-line-number="36" style="position:relative"><span class="pl-k">def</span> <span class="pl-en">detect_encoding</span>(<span class="pl-s1">file_path</span>):</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC37" class="react-file-line html-div" data-testid="code-cell" data-line-number="37" style="position:relative">    <span class="pl-k">with</span> <span class="pl-en">open</span>(<span class="pl-s1">file_path</span>, <span class="pl-s">&#039;rb&#039;</span>) <span class="pl-k">as</span> <span class="pl-s1">f</span>:</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC38" class="react-file-line html-div" data-testid="code-cell" data-line-number="38" style="position:relative">        <span class="pl-s1">result</span> <span class="pl-c1">=</span> <span class="pl-s1">chardet</span>.<span class="pl-c1">detect</span>(<span class="pl-s1">f</span>.<span class="pl-c1">read</span>())</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC39" class="react-file-line html-div" data-testid="code-cell" data-line-number="39" style="position:relative">        <span class="pl-k">return</span> <span class="pl-s1">result</span>[<span class="pl-s">&#039;encoding&#039;</span>]</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC40" class="react-file-line html-div" data-testid="code-cell" data-line-number="40" style="position:relative">

# Function to transform a document into the Solr XML structure
def transform_document(lines):
    doc_lines = []
    doc_lines.append("  <doc>")  # Start of the document
    for line in lines:
        line = line.strip()
        if line.startswith("<DOCNO>"):
            content = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
            doc_lines.append(f"    <field name=\"DOCNO\">{content}</field>")
        elif line.startswith("<FILEID>"):
            content = line.replace("<FILEID>", "").replace("</FILEID>", "").strip()
            doc_lines.append(f"    <field name=\"FILEID\">{content}</field>")
        elif line.startswith("<FIRST>"):
            content = line.replace("<FIRST>", "").replace("</FIRST>", "").strip()
            doc_lines.append(f"    <field name=\"FIRST\">{content}</field>")
        elif line.startswith("<SECOND>"):
            content = line.replace("<SECOND>", "").replace("</SECOND>", "").strip()
            doc_lines.append(f"    <field name=\"SECOND\">{content}</field>")
        elif line.startswith("<HEAD>"):
            content = line.replace("<HEAD>", "").replace("</HEAD>", "").strip()
            doc_lines.append(f"    <field name=\"HEAD\">{content}</field>")
        elif line.startswith("<DATELINE>"):
            content = line.replace("<DATELINE>", "").replace("</DATELINE>", "").strip()
            doc_lines.append(f"    <field name=\"DATELINE\">{content}</field>")
        elif line.startswith("<TEXT>"):
            content = line.replace("<TEXT>", "").replace("</TEXT>", "").strip()
            doc_lines.append(f"    <field name=\"TEXT\">{content}</field>")
    doc_lines.append("  </doc>")  # End of the document
    return doc_lines


# Walk through all XML files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.xml'):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        try:
            # Detect the encoding
            encoding = detect_encoding(input_path)
            print(f"Detected encoding for {filename}: {encoding}")

            # Read the file with the detected encoding
            with open(input_path, 'r', encoding=encoding, errors='ignore') as file:
                lines = file.readlines()

            # Transform and write to the output file
            with open(output_path, 'w', encoding='utf-8') as out_file:
                out_file.write("<add>\n")  # Start of the Solr root element
                current_doc = []
                in_doc = False
                for line in lines:
                    if "<DOC>" in line:  # Start of a document
                        in_doc = True
                        current_doc = []
                    elif "</DOC>" in line:  # End of a document
                        in_doc = False
                        # Transform the document and write it to the file
                        transformed_doc = transform_document(current_doc)
                        out_file.write("\n".join(transformed_doc) + "\n")
                    elif in_doc:
                        current_doc.append(line)
                out_file.write("</add>\n")  # End of the Solr root element

            print(f"File fixed and converted: {output_path}")

        except Exception as e:
            print(f"Error while processing {filename}: {e}")

"""
# Walk through all files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.gz'):  # Check whether the file is in .gz format
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename[:-3] + '.xml')  # Drop '.gz' and add '.xml'

        # Decompress the file and write it directly with the .xml extension
        with gzip.open(input_path, 'rb') as f_in:
            with open(output_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        print(f"File extracted and saved as XML: {output_path}")"""

"""

# Creating the long queries

# List of files to process
files = ['Topics-requetes/topics.1-50.txt', 'Topics-requetes/topics.51-100.txt', 'Topics-requetes/topics.101-150.txt']

# Dictionaries to store the results
def lire_fichier(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

req_longues_combines = {}

req_courtes_combines = {}

for fichier in files:
    data = lire_fichier(fichier)
    resultat = extraire_requetes_longues(data)
    req_longues_combines.update(resultat)

for fichier in files:
    data = lire_fichier(fichier)
    resultat = extraire_requetes_courtes(data)
    req_courtes_combines.update(resultat)
"""
# Display the combined results
#print(resultats_combines)

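# The helpers extraire_requetes_longues / extraire_requetes_courtes referenced in the
# block above are not defined in this excerpt (they are presumably defined earlier).
# The sketch below only illustrates what such TREC topic parsers might look like;
# the function names, regexes and assumed topic layout are all assumptions.
import re  # only needed for this illustrative sketch

def extract_short_queries_sketch(text):
    """Illustrative only: map topic number -> <title> line of each <top> block."""
    queries = {}
    for block in re.findall(r"<top>(.*?)</top>", text, re.DOTALL):
        num = re.search(r"<num>\s*Number:\s*(\d+)", block)
        title = re.search(r"<title>\s*(?:Topic:)?\s*(.+)", block)
        if num and title:
            queries[num.group(1)] = title.group(1).strip()
    return queries

def extract_long_queries_sketch(text):
    """Illustrative only: map topic number -> title + description of each <top> block."""
    queries = {}
    for block in re.findall(r"<top>(.*?)</top>", text, re.DOTALL):
        num = re.search(r"<num>\s*Number:\s*(\d+)", block)
        title = re.search(r"<title>\s*(?:Topic:)?\s*(.+)", block)
        desc = re.search(r"<desc>\s*(?:Description:)?\s*(.*?)(?=<narr>|\Z)", block, re.DOTALL)
        if num:
            parts = []
            if title:
                parts.append(title.group(1).strip())
            if desc:
                parts.append(" ".join(desc.group(1).split()))
            queries[num.group(1)] = " ".join(parts).strip()
    return queries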
# Conversion to JSON files
"""
with open('requetes/requetes_longues.json', 'w', encoding='utf-8') as fichier_json:
    json.dump(req_longues_combines, fichier_json, ensure_ascii=False, indent=4)


# Creating the short queries

with open('requetes/requetes_courtes.json', 'w', encoding='utf-8') as fichier_json:
    json.dump(req_courtes_combines, fichier_json, ensure_ascii=False, indent=4)

"""
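# Assumed output shape (illustrative): requetes/requetes_longues.json and
# requetes/requetes_courtes.json would each hold a mapping from topic number to query
# text, e.g. {"051": "<query text for topic 51>", "052": "..."}. This assumes the
# extraire_* helpers return dicts keyed by topic number, as the .update() calls suggest.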
class="TreeView__UlBox-sc-4ex6b6-0 cJWUiG"><li class="PRIVATE_TreeView-item" tabindex="0" id="0input_folder" role="treeitem" aria-labelledby=":R38qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R38qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="input_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>input_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="1output_folder" role="treeitem" aria-labelledby=":R58qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R58qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="output_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>output_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="2input_folder" role="treeitem" aria-labelledby=":R78qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R78qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="input_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>input_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="3output_folder" role="treeitem" aria-labelledby=":R98qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R98qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="output_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>output_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="4detect_encoding" role="treeitem" aria-labelledby=":Rb8qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rb8qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 gxAxAi"></div><div class="Box-sc-g0xbh4-0 gWkFIQ">func</div></div>  <div 
title="detect_encoding" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>detect_encoding</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="5transform_document" role="treeitem" aria-labelledby=":Rd8qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rd8qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 gxAxAi"></div><div class="Box-sc-g0xbh4-0 gWkFIQ">func</div></div>  <div title="transform_document" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>transform_document</span></div></div></span></div></div></li></ul></div></div></div></div></div></div> <!-- --> <!-- --> </div></div></div><div class="Box-sc-g0xbh4-0"></div></div></div></div></div><div id="find-result-marks-container" class="Box-sc-g0xbh4-0 cCoXib"></div><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button></div> <!-- --> <!-- --> <script type="application/json" id="__PRIMER_DATA_:R0:__">{"resolvedServerColorMode":"night"}</script></div>
</react-app>
</turbo-frame>



  </div>

</turbo-frame>

    </main>
  </div>

  </div>

          <footer class="footer pt-8 pb-6 f6 color-fg-muted p-responsive" role="contentinfo" >
  <h2 class='sr-only'>Footer</h2>

  


  <div class="d-flex flex-justify-center flex-items-center flex-column-reverse flex-lg-row flex-wrap flex-lg-nowrap">
    <div class="d-flex flex-items-center flex-shrink-0 mx-2">
      <a aria-label="Homepage" title="GitHub" class="footer-octicon mr-2" href="https://github.com">
        <svg aria-hidden="true" height="24" viewBox="0 0 24 24" version="1.1" width="24" data-view-component="true" class="octicon octicon-mark-github">
    <path d="M12.5.75C6.146.75 1 5.896 1 12.25c0 5.089 3.292 9.387 7.863 10.91.575.101.79-.244.79-.546 0-.273-.014-1.178-.014-2.142-2.889.532-3.636-.704-3.866-1.35-.13-.331-.69-1.352-1.18-1.625-.402-.216-.977-.748-.014-.762.906-.014 1.553.834 1.769 1.179 1.035 1.74 2.688 1.25 3.349.948.1-.747.402-1.25.733-1.538-2.559-.287-5.232-1.279-5.232-5.678 0-1.25.445-2.285 1.178-3.09-.115-.288-.517-1.467.115-3.048 0 0 .963-.302 3.163 1.179.92-.259 1.897-.388 2.875-.388.977 0 1.955.13 2.875.388 2.2-1.495 3.162-1.179 3.162-1.179.633 1.581.23 2.76.115 3.048.733.805 1.179 1.825 1.179 3.09 0 4.413-2.688 5.39-5.247 5.678.417.36.776 1.05.776 2.128 0 1.538-.014 2.774-.014 3.162 0 .302.216.662.79.547C20.709 21.637 24 17.324 24 12.25 24 5.896 18.854.75 12.5.75Z"></path>
</svg>
</a>
      <span>
        &copy; 2025 GitHub,&nbsp;Inc.
      </span>
    </div>

    <nav aria-label="Footer">
      <h3 class="sr-only" id="sr-footer-heading">Footer navigation</h3>

      <ul class="list-style-none d-flex flex-justify-center flex-wrap mb-2 mb-lg-0" aria-labelledby="sr-footer-heading">

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to Terms&quot;,&quot;label&quot;:&quot;text:terms&quot;}" href="https://docs.github.com/site-policy/github-terms/github-terms-of-service" data-view-component="true" class="Link--secondary Link">Terms</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to privacy&quot;,&quot;label&quot;:&quot;text:privacy&quot;}" href="https://docs.github.com/site-policy/privacy-policies/github-privacy-statement" data-view-component="true" class="Link--secondary Link">Privacy</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to security&quot;,&quot;label&quot;:&quot;text:security&quot;}" href="https://github.com/security" data-view-component="true" class="Link--secondary Link">Security</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to status&quot;,&quot;label&quot;:&quot;text:status&quot;}" href="https://www.githubstatus.com/" data-view-component="true" class="Link--secondary Link">Status</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to docs&quot;,&quot;label&quot;:&quot;text:docs&quot;}" href="https://docs.github.com/" data-view-component="true" class="Link--secondary Link">Docs</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to contact&quot;,&quot;label&quot;:&quot;text:contact&quot;}" href="https://support.github.com?tags=dotcom-footer" data-view-component="true" class="Link--secondary Link">Contact</a>
          </li>

          <li class="mx-2" >
  <cookie-consent-link>
    <button
      type="button"
      class="Link--secondary underline-on-hover border-0 p-0 color-bg-transparent"
      data-action="click:cookie-consent-link#showConsentManagement"
      data-analytics-event="{&quot;location&quot;:&quot;footer&quot;,&quot;action&quot;:&quot;cookies&quot;,&quot;context&quot;:&quot;subfooter&quot;,&quot;tag&quot;:&quot;link&quot;,&quot;label&quot;:&quot;cookies_link_subfooter_footer&quot;}"
    >
      Manage cookies
    </button>
  </cookie-consent-link>
</li>

<li class="mx-2">
  <cookie-consent-link>
    <button
      type="button"
      class="Link--secondary underline-on-hover border-0 p-0 color-bg-transparent"
      data-action="click:cookie-consent-link#showConsentManagement"
      data-analytics-event="{&quot;location&quot;:&quot;footer&quot;,&quot;action&quot;:&quot;dont_share_info&quot;,&quot;context&quot;:&quot;subfooter&quot;,&quot;tag&quot;:&quot;link&quot;,&quot;label&quot;:&quot;dont_share_info_link_subfooter_footer&quot;}"
    >
      Do not share my personal information
    </button>
  </cookie-consent-link>
</li>

      </ul>
    </nav>
  </div>
</footer>



    <ghcc-consent id="ghcc" class="position-fixed bottom-0 left-0" style="z-index: 999999" data-initial-cookie-consent-allowed="" data-cookie-consent-required="true"></ghcc-consent>



  <div id="ajax-error-message" class="ajax-error-message flash flash-error" hidden>
    <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-alert">
    <path d="M6.457 1.047c.659-1.234 2.427-1.234 3.086 0l6.082 11.378A1.75 1.75 0 0 1 14.082 15H1.918a1.75 1.75 0 0 1-1.543-2.575Zm1.763.707a.25.25 0 0 0-.44 0L1.698 13.132a.25.25 0 0 0 .22.368h12.164a.25.25 0 0 0 .22-.368Zm.53 3.996v2.5a.75.75 0 0 1-1.5 0v-2.5a.75.75 0 0 1 1.5 0ZM9 11a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
    <button type="button" class="flash-close js-ajax-error-dismiss" aria-label="Dismiss error">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg>
    </button>
    You can’t perform that action at this time.
  </div>

    <template id="site-details-dialog">
  <details class="details-reset details-overlay details-overlay-dark lh-default color-fg-default hx_rsm" open>
    <summary role="button" aria-label="Close dialog"></summary>
    <details-dialog class="Box Box--overlay d-flex flex-column anim-fade-in fast hx_rsm-dialog hx_rsm-modal">
      <button class="Box-btn-octicon m-0 btn-octicon position-absolute right-0 top-0" type="button" aria-label="Close dialog" data-close-dialog>
        <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg>
      </button>
      <div class="octocat-spinner my-6 js-details-dialog-spinner"></div>
    </details-dialog>
  </details>
</template>

    <div class="Popover js-hovercard-content position-absolute" style="display: none; outline: none;">
  <div class="Popover-message Popover-message--bottom-left Popover-message--large Box color-shadow-large" style="width:360px;">
  </div>
</div>

    <template id="snippet-clipboard-copy-button">
  <div class="zeroclipboard-container position-absolute right-0 top-0">
    <clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0" data-copy-feedback="Copied!" data-tooltip-direction="w">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
    <path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-check js-clipboard-check-icon color-fg-success d-none m-2">
    <path d="M13.78 4.22a.75.75 0 0 1 0 1.06l-7.25 7.25a.75.75 0 0 1-1.06 0L2.22 9.28a.751.751 0 0 1 .018-1.042.751.751 0 0 1 1.042-.018L6 10.94l6.72-6.72a.75.75 0 0 1 1.06 0Z"></path>
</svg>
    </clipboard-copy>
  </div>
</template>
<template id="snippet-clipboard-copy-button-unpositioned">
  <div class="zeroclipboard-container">
    <clipboard-copy aria-label="Copy" class="ClipboardButton btn btn-invisible js-clipboard-copy m-2 p-0 d-flex flex-justify-center flex-items-center" data-copy-feedback="Copied!" data-tooltip-direction="w">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon">
    <path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-check js-clipboard-check-icon color-fg-success d-none">
    <path d="M13.78 4.22a.75.75 0 0 1 0 1.06l-7.25 7.25a.75.75 0 0 1-1.06 0L2.22 9.28a.751.751 0 0 1 .018-1.042.751.751 0 0 1 1.042-.018L6 10.94l6.72-6.72a.75.75 0 0 1 1.06 0Z"></path>
</svg>
    </clipboard-copy>
  </div>
</template>


    <style>
      .user-mention[href$="/DominiqueLoyer"] {
        color: var(--color-user-mention-fg);
        background-color: var(--bgColor-attention-muted, var(--color-attention-subtle));
        border-radius: 2px;
        margin-left: -2px;
        margin-right: -2px;
      }
      .user-mention[href$="/DominiqueLoyer"]:before,
      .user-mention[href$="/DominiqueLoyer"]:after {
        content: '';
        display: inline-block;
        width: 2px;
      }
    </style>


    </div>

    <div id="js-global-screen-reader-notice" class="sr-only mt-n1" aria-live="polite" aria-atomic="true" ></div>
    <div id="js-global-screen-reader-notice-assertive" class="sr-only mt-n1" aria-live="assertive" aria-atomic="true"></div>
  </body>
</html>

To cite this code:

Loyer, Dominique. (2024). extraire_fichiers copie (trashed 2025-05-30 11-48-03) copie.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

extraire_fichiers copie (trashed 2025-05-30 11-48-03).py

The automatic description could not be generated (API error). The file corresponds to extraire_fichiers.py from the Alimiji/Solr_utilisation repository (an example project showing how to use Solr) and defines the functions detect_encoding and transform_document.

Keywords: error, api
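
As an illustration only, here is a minimal sketch of what a detect_encoding / transform_document pair could look like; the implementation, the chardet dependency, and the UTF-8 fallback are assumptions, not the original code from the repository.

# Hypothetical sketch, not the author's implementation.
import chardet  # assumed dependency: pip install chardet

def detect_encoding(path: str, sample_size: int = 100_000) -> str:
    """Guess the text encoding of a file from a leading byte sample."""
    with open(path, "rb") as f:
        raw = f.read(sample_size)
    guess = chardet.detect(raw)
    # Fall back to UTF-8 when chardet cannot decide.
    return guess["encoding"] or "utf-8"

def transform_document(path: str) -> str:
    """Read a file with its detected encoding and normalize line endings."""
    with open(path, encoding=detect_encoding(path), errors="replace") as f:
        text = f.read()
    return text.replace("\r\n", "\n").strip()

# Example usage (hypothetical): text = transform_document("docs/exemple.txt")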







    <path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 7Z"></path>
</svg>
          </button>


<div class="AppHeader-search-whenRegular">
  <div class="AppHeader-search-wrap AppHeader-search-wrap--hasTrailing">
    <div class="AppHeader-search-control AppHeader-search-control-overflow">
      <label
        for="AppHeader-searchInput"
        aria-label="Search or jump to…"
        class="AppHeader-search-visual--leading"
      >
        <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-search">
    <path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 7Z"></path>
</svg>
      </label>

                <button
            type="button"
            data-target="qbsearch-input.inputButton"
            data-action="click:qbsearch-input#handleExpand"
            class="AppHeader-searchButton form-control input-contrast text-left color-fg-subtle no-wrap"
            data-hotkey="s,/"
            data-analytics-event="{&quot;location&quot;:&quot;navbar&quot;,&quot;action&quot;:&quot;searchbar&quot;,&quot;context&quot;:&quot;global&quot;,&quot;tag&quot;:&quot;input&quot;,&quot;label&quot;:&quot;searchbar_input_global_navbar&quot;}"
            aria-describedby="search-error-message-flash"
          >
            <div class="overflow-hidden">
              <span id="qb-input-query" data-target="qbsearch-input.inputButtonText">
                  Type <kbd class="AppHeader-search-kbd">/</kbd> to search
              </span>
            </div>
          </button>

    </div>


  </div>
</div>

    <input type="hidden" name="type" class="js-site-search-type-field">

    
<div class="Overlay--hidden " data-modal-dialog-overlay>
  <modal-dialog data-action="close:qbsearch-input#handleClose cancel:qbsearch-input#handleClose" data-target="qbsearch-input.searchSuggestionsDialog" role="dialog" id="search-suggestions-dialog" aria-modal="true" aria-labelledby="search-suggestions-dialog-header" data-view-component="true" class="Overlay Overlay--width-medium Overlay--height-auto">
      <h1 id="search-suggestions-dialog-header" class="sr-only">Search code, repositories, users, issues, pull requests...</h1>
    <div class="Overlay-body Overlay-body--paddingNone">
      
          <div data-view-component="true">        <div class="search-suggestions position-absolute width-full color-shadow-large border color-fg-default color-bg-default overflow-hidden d-flex flex-column query-builder-container"
          style="border-radius: 12px;"
          data-target="qbsearch-input.queryBuilderContainer"
          hidden
        >
          <!-- '"` --><!-- </textarea></xmp> --></option></form><form id="query-builder-test-form" action="" accept-charset="UTF-8" method="get">
  <query-builder data-target="qbsearch-input.queryBuilder" id="query-builder-query-builder-test" data-filter-key=":" data-view-component="true" class="QueryBuilder search-query-builder">
    <div class="FormControl FormControl--fullWidth">
      <label id="query-builder-test-label" for="query-builder-test" class="FormControl-label sr-only">
        Search
      </label>
      <div
        class="QueryBuilder-StyledInput width-fit "
        data-target="query-builder.styledInput"
      >
          <span id="query-builder-test-leadingvisual-wrap" class="FormControl-input-leadingVisualWrap QueryBuilder-leadingVisualWrap">
            <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-search FormControl-input-leadingVisual">
    <path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 7Z"></path>
</svg>
          </span>
        <div data-target="query-builder.styledInputContainer" class="QueryBuilder-StyledInputContainer">
          <div
            aria-hidden="true"
            class="QueryBuilder-StyledInputContent"
            data-target="query-builder.styledInputContent"
          ></div>
          <div class="QueryBuilder-InputWrapper">
            <div aria-hidden="true" class="QueryBuilder-Sizer" data-target="query-builder.sizer"></div>
            <input id="query-builder-test" name="query-builder-test" value="" autocomplete="off" type="text" role="combobox" spellcheck="false" aria-expanded="false" aria-describedby="validation-d9842016-babf-4885-ab02-96c584c5e287" data-target="query-builder.input" data-action="
          input:query-builder#inputChange
          blur:query-builder#inputBlur
          keydown:query-builder#inputKeydown
          focus:query-builder#inputFocus
        " data-view-component="true" class="FormControl-input QueryBuilder-Input FormControl-medium" />
          </div>
        </div>
          <span class="sr-only" id="query-builder-test-clear">Clear</span>
          <button role="button" id="query-builder-test-clear-button" aria-labelledby="query-builder-test-clear query-builder-test-label" data-target="query-builder.clearButton" data-action="
                click:query-builder#clear
                focus:query-builder#clearButtonFocus
                blur:query-builder#clearButtonBlur
              " variant="small" hidden="hidden" type="button" data-view-component="true" class="Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x-circle-fill Button-visual">
    <path d="M2.343 13.657A8 8 0 1 1 13.658 2.343 8 8 0 0 1 2.343 13.657ZM6.03 4.97a.751.751 0 0 0-1.042.018.751.751 0 0 0-.018 1.042L6.94 8 4.97 9.97a.749.749 0 0 0 .326 1.275.749.749 0 0 0 .734-.215L8 9.06l1.97 1.97a.749.749 0 0 0 1.275-.326.749.749 0 0 0-.215-.734L9.06 8l1.97-1.97a.749.749 0 0 0-.326-1.275.749.749 0 0 0-.734.215L8 6.94Z"></path>
</svg>
</button>

      </div>
      <template id="search-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-search">
    <path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 7Z"></path>
</svg>
</template>

<template id="code-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code">
    <path d="m11.28 3.22 4.25 4.25a.75.75 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L13.94 8l-3.72-3.72a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215Zm-6.56 0a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042L2.06 8l3.72 3.72a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L.47 8.53a.75.75 0 0 1 0-1.06Z"></path>
</svg>
</template>

<template id="file-code-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-file-code">
    <path d="M4 1.75C4 .784 4.784 0 5.75 0h5.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v8.586A1.75 1.75 0 0 1 14.25 15h-9a.75.75 0 0 1 0-1.5h9a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 10 4.25V1.5H5.75a.25.25 0 0 0-.25.25v2.5a.75.75 0 0 1-1.5 0Zm1.72 4.97a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 2a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734l1.47-1.47-1.47-1.47a.75.75 0 0 1 0-1.06ZM3.28 7.78 1.81 9.25l1.47 1.47a.751.751 0 0 1-.018 1.042.751.751 0 0 1-1.042.018l-2-2a.75.75 0 0 1 0-1.06l2-2a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Zm8.22-6.218V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path>
</svg>
</template>

<template id="history-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-history">
    <path d="m.427 1.927 1.215 1.215a8.002 8.002 0 1 1-1.6 5.685.75.75 0 1 1 1.493-.154 6.5 6.5 0 1 0 1.18-4.458l1.358 1.358A.25.25 0 0 1 3.896 6H.25A.25.25 0 0 1 0 5.75V2.104a.25.25 0 0 1 .427-.177ZM7.75 4a.75.75 0 0 1 .75.75v2.992l2.028.812a.75.75 0 0 1-.557 1.392l-2.5-1A.751.751 0 0 1 7 8.25v-3.5A.75.75 0 0 1 7.75 4Z"></path>
</svg>
</template>

<template id="repo-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-repo">
    <path d="M2 2.5A2.5 2.5 0 0 1 4.5 0h8.75a.75.75 0 0 1 .75.75v12.5a.75.75 0 0 1-.75.75h-2.5a.75.75 0 0 1 0-1.5h1.75v-2h-8a1 1 0 0 0-.714 1.7.75.75 0 1 1-1.072 1.05A2.495 2.495 0 0 1 2 11.5Zm10.5-1h-8a1 1 0 0 0-1 1v6.708A2.486 2.486 0 0 1 4.5 9h8ZM5 12.25a.25.25 0 0 1 .25-.25h3.5a.25.25 0 0 1 .25.25v3.25a.25.25 0 0 1-.4.2l-1.45-1.087a.249.249 0 0 0-.3 0L5.4 15.7a.25.25 0 0 1-.4-.2Z"></path>
</svg>
</template>

<template id="bookmark-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-bookmark">
    <path d="M3 2.75C3 1.784 3.784 1 4.75 1h6.5c.966 0 1.75.784 1.75 1.75v11.5a.75.75 0 0 1-1.227.579L8 11.722l-3.773 3.107A.751.751 0 0 1 3 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v9.91l3.023-2.489a.75.75 0 0 1 .954 0l3.023 2.49V2.75a.25.25 0 0 0-.25-.25Z"></path>
</svg>
</template>

<template id="plus-circle-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-plus-circle">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Zm7.25-3.25v2.5h2.5a.75.75 0 0 1 0 1.5h-2.5v2.5a.75.75 0 0 1-1.5 0v-2.5h-2.5a.75.75 0 0 1 0-1.5h2.5v-2.5a.75.75 0 0 1 1.5 0Z"></path>
</svg>
</template>

<template id="circle-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-dot-fill">
    <path d="M8 4a4 4 0 1 1 0 8 4 4 0 0 1 0-8Z"></path>
</svg>
</template>

<template id="trash-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-trash">
    <path d="M11 1.75V3h2.25a.75.75 0 0 1 0 1.5H2.75a.75.75 0 0 1 0-1.5H5V1.75C5 .784 5.784 0 6.75 0h2.5C10.216 0 11 .784 11 1.75ZM4.496 6.675l.66 6.6a.25.25 0 0 0 .249.225h5.19a.25.25 0 0 0 .249-.225l.66-6.6a.75.75 0 0 1 1.492.149l-.66 6.6A1.748 1.748 0 0 1 10.595 15h-5.19a1.75 1.75 0 0 1-1.741-1.575l-.66-6.6a.75.75 0 1 1 1.492-.15ZM6.5 1.75V3h3V1.75a.25.25 0 0 0-.25-.25h-2.5a.25.25 0 0 0-.25.25Z"></path>
</svg>
</template>

<template id="team-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-people">
    <path d="M2 5.5a3.5 3.5 0 1 1 5.898 2.549 5.508 5.508 0 0 1 3.034 4.084.75.75 0 1 1-1.482.235 4 4 0 0 0-7.9 0 .75.75 0 0 1-1.482-.236A5.507 5.507 0 0 1 3.102 8.05 3.493 3.493 0 0 1 2 5.5ZM11 4a3.001 3.001 0 0 1 2.22 5.018 5.01 5.01 0 0 1 2.56 3.012.749.749 0 0 1-.885.954.752.752 0 0 1-.549-.514 3.507 3.507 0 0 0-2.522-2.372.75.75 0 0 1-.574-.73v-.352a.75.75 0 0 1 .416-.672A1.5 1.5 0 0 0 11 5.5.75.75 0 0 1 11 4Zm-5.5-.5a2 2 0 1 0-.001 3.999A2 2 0 0 0 5.5 3.5Z"></path>
</svg>
</template>

<template id="project-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-project">
    <path d="M1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25V1.75C0 .784.784 0 1.75 0ZM1.5 1.75v12.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25ZM11.75 3a.75.75 0 0 1 .75.75v7.5a.75.75 0 0 1-1.5 0v-7.5a.75.75 0 0 1 .75-.75Zm-8.25.75a.75.75 0 0 1 1.5 0v5.5a.75.75 0 0 1-1.5 0ZM8 3a.75.75 0 0 1 .75.75v3.5a.75.75 0 0 1-1.5 0v-3.5A.75.75 0 0 1 8 3Z"></path>
</svg>
</template>

<template id="pencil-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-pencil">
    <path d="M11.013 1.427a1.75 1.75 0 0 1 2.474 0l1.086 1.086a1.75 1.75 0 0 1 0 2.474l-8.61 8.61c-.21.21-.47.364-.756.445l-3.251.93a.75.75 0 0 1-.927-.928l.929-3.25c.081-.286.235-.547.445-.758l8.61-8.61Zm.176 4.823L9.75 4.81l-6.286 6.287a.253.253 0 0 0-.064.108l-.558 1.953 1.953-.558a.253.253 0 0 0 .108-.064Zm1.238-3.763a.25.25 0 0 0-.354 0L10.811 3.75l1.439 1.44 1.263-1.263a.25.25 0 0 0 0-.354Z"></path>
</svg>
</template>

<template id="copilot-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copilot">
    <path d="M7.998 15.035c-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.201-.508-.254-1.084-.254-1.656 0-.87.128-1.769.693-2.484.579-.733 1.494-1.124 2.724-1.261 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095v1.872c0 .766-3.351 3.795-8.002 3.795Zm0-1.485c2.28 0 4.584-1.11 5.002-1.433V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-1.146 0-2.059-.327-2.71-.991A3.222 3.222 0 0 1 8 6.303a3.24 3.24 0 0 1-.544.743c-.65.664-1.563.991-2.71.991-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433ZM6.762 2.83c-.193-.206-.637-.413-1.682-.297-1.019.113-1.479.404-1.713.7-.247.312-.369.789-.369 1.554 0 .793.129 1.171.308 1.371.162.181.519.379 1.442.379.853 0 1.339-.235 1.638-.54.315-.322.527-.827.617-1.553.117-.935-.037-1.395-.241-1.614Zm4.155-.297c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Z"></path><path d="M6.25 9.037a.75.75 0 0 1 .75.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 .75-.75Zm4.25.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 1.5 0Z"></path>
</svg>
</template>

<template id="copilot-error-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copilot-error">
    <path d="M16 11.24c0 .112-.072.274-.21.467L13 9.688V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-.198 0-.388-.009-.571-.029L6.833 5.226a4.01 4.01 0 0 0 .17-.782c.117-.935-.037-1.395-.241-1.614-.193-.206-.637-.413-1.682-.297-.683.076-1.115.231-1.395.415l-1.257-.91c.579-.564 1.413-.877 2.485-.996 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095Zm-5.083-8.707c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Zm2.511 11.074c-1.393.776-3.272 1.428-5.43 1.428-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.18-.455-.241-.963-.252-1.475L.31 4.107A.747.747 0 0 1 0 3.509V3.49a.748.748 0 0 1 .625-.73c.156-.026.306.047.435.139l14.667 10.578a.592.592 0 0 1 .227.264.752.752 0 0 1 .046.249v.022a.75.75 0 0 1-1.19.596Zm-1.367-.991L5.635 7.964a5.128 5.128 0 0 1-.889.073c-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433 1.539 0 3.089-.505 4.063-.934Z"></path>
</svg>
</template>

<template id="workflow-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-workflow">
    <path d="M0 1.75C0 .784.784 0 1.75 0h3.5C6.216 0 7 .784 7 1.75v3.5A1.75 1.75 0 0 1 5.25 7H4v4a1 1 0 0 0 1 1h4v-1.25C9 9.784 9.784 9 10.75 9h3.5c.966 0 1.75.784 1.75 1.75v3.5A1.75 1.75 0 0 1 14.25 16h-3.5A1.75 1.75 0 0 1 9 14.25v-.75H5A2.5 2.5 0 0 1 2.5 11V7h-.75A1.75 1.75 0 0 1 0 5.25Zm1.75-.25a.25.25 0 0 0-.25.25v3.5c0 .138.112.25.25.25h3.5a.25.25 0 0 0 .25-.25v-3.5a.25.25 0 0 0-.25-.25Zm9 9a.25.25 0 0 0-.25.25v3.5c0 .138.112.25.25.25h3.5a.25.25 0 0 0 .25-.25v-3.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
</template>

<template id="book-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-book">
    <path d="M0 1.75A.75.75 0 0 1 .75 1h4.253c1.227 0 2.317.59 3 1.501A3.743 3.743 0 0 1 11.006 1h4.245a.75.75 0 0 1 .75.75v10.5a.75.75 0 0 1-.75.75h-4.507a2.25 2.25 0 0 0-1.591.659l-.622.621a.75.75 0 0 1-1.06 0l-.622-.621A2.25 2.25 0 0 0 5.258 13H.75a.75.75 0 0 1-.75-.75Zm7.251 10.324.004-5.073-.002-2.253A2.25 2.25 0 0 0 5.003 2.5H1.5v9h3.757a3.75 3.75 0 0 1 1.994.574ZM8.755 4.75l-.004 7.322a3.752 3.752 0 0 1 1.992-.572H14.5v-9h-3.495a2.25 2.25 0 0 0-2.25 2.25Z"></path>
</svg>
</template>

<template id="code-review-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code-review">
    <path d="M1.75 1h12.5c.966 0 1.75.784 1.75 1.75v8.5A1.75 1.75 0 0 1 14.25 13H8.061l-2.574 2.573A1.458 1.458 0 0 1 3 14.543V13H1.75A1.75 1.75 0 0 1 0 11.25v-8.5C0 1.784.784 1 1.75 1ZM1.5 2.75v8.5c0 .138.112.25.25.25h2a.75.75 0 0 1 .75.75v2.19l2.72-2.72a.749.749 0 0 1 .53-.22h6.5a.25.25 0 0 0 .25-.25v-8.5a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25Zm5.28 1.72a.75.75 0 0 1 0 1.06L5.31 7l1.47 1.47a.751.751 0 0 1-.018 1.042.751.751 0 0 1-1.042.018l-2-2a.75.75 0 0 1 0-1.06l2-2a.75.75 0 0 1 1.06 0Zm2.44 0a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 2a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L10.69 7 9.22 5.53a.75.75 0 0 1 0-1.06Z"></path>
</svg>
</template>

<template id="codespaces-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-codespaces">
    <path d="M0 11.25c0-.966.784-1.75 1.75-1.75h12.5c.966 0 1.75.784 1.75 1.75v3A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm2-9.5C2 .784 2.784 0 3.75 0h8.5C13.216 0 14 .784 14 1.75v5a1.75 1.75 0 0 1-1.75 1.75h-8.5A1.75 1.75 0 0 1 2 6.75Zm1.75-.25a.25.25 0 0 0-.25.25v5c0 .138.112.25.25.25h8.5a.25.25 0 0 0 .25-.25v-5a.25.25 0 0 0-.25-.25Zm-2 9.5a.25.25 0 0 0-.25.25v3c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-3a.25.25 0 0 0-.25-.25Z"></path><path d="M7 12.75a.75.75 0 0 1 .75-.75h4.5a.75.75 0 0 1 0 1.5h-4.5a.75.75 0 0 1-.75-.75Zm-4 0a.75.75 0 0 1 .75-.75h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1-.75-.75Z"></path>
</svg>
</template>

<template id="comment-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-comment">
    <path d="M1 2.75C1 1.784 1.784 1 2.75 1h10.5c.966 0 1.75.784 1.75 1.75v7.5A1.75 1.75 0 0 1 13.25 12H9.06l-2.573 2.573A1.458 1.458 0 0 1 4 13.543V12H2.75A1.75 1.75 0 0 1 1 10.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h2a.75.75 0 0 1 .75.75v2.19l2.72-2.72a.749.749 0 0 1 .53-.22h4.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
</template>

<template id="comment-discussion-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-comment-discussion">
    <path d="M1.75 1h8.5c.966 0 1.75.784 1.75 1.75v5.5A1.75 1.75 0 0 1 10.25 10H7.061l-2.574 2.573A1.458 1.458 0 0 1 2 11.543V10h-.25A1.75 1.75 0 0 1 0 8.25v-5.5C0 1.784.784 1 1.75 1ZM1.5 2.75v5.5c0 .138.112.25.25.25h1a.75.75 0 0 1 .75.75v2.19l2.72-2.72a.749.749 0 0 1 .53-.22h3.5a.25.25 0 0 0 .25-.25v-5.5a.25.25 0 0 0-.25-.25h-8.5a.25.25 0 0 0-.25.25Zm13 2a.25.25 0 0 0-.25-.25h-.5a.75.75 0 0 1 0-1.5h.5c.966 0 1.75.784 1.75 1.75v5.5A1.75 1.75 0 0 1 14.25 12H14v1.543a1.458 1.458 0 0 1-2.487 1.03L9.22 12.28a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215l2.22 2.22v-2.19a.75.75 0 0 1 .75-.75h1a.25.25 0 0 0 .25-.25Z"></path>
</svg>
</template>

<template id="organization-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-organization">
    <path d="M1.75 16A1.75 1.75 0 0 1 0 14.25V1.75C0 .784.784 0 1.75 0h8.5C11.216 0 12 .784 12 1.75v12.5c0 .085-.006.168-.018.25h2.268a.25.25 0 0 0 .25-.25V8.285a.25.25 0 0 0-.111-.208l-1.055-.703a.749.749 0 1 1 .832-1.248l1.055.703c.487.325.779.871.779 1.456v5.965A1.75 1.75 0 0 1 14.25 16h-3.5a.766.766 0 0 1-.197-.026c-.099.017-.2.026-.303.026h-3a.75.75 0 0 1-.75-.75V14h-1v1.25a.75.75 0 0 1-.75.75Zm-.25-1.75c0 .138.112.25.25.25H4v-1.25a.75.75 0 0 1 .75-.75h2.5a.75.75 0 0 1 .75.75v1.25h2.25a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25h-8.5a.25.25 0 0 0-.25.25ZM3.75 6h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5ZM3 3.75A.75.75 0 0 1 3.75 3h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 3 3.75Zm4 3A.75.75 0 0 1 7.75 6h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 7 6.75ZM7.75 3h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5ZM3 9.75A.75.75 0 0 1 3.75 9h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 3 9.75ZM7.75 9h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5Z"></path>
</svg>
</template>

<template id="rocket-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-rocket">
    <path d="M14.064 0h.186C15.216 0 16 .784 16 1.75v.186a8.752 8.752 0 0 1-2.564 6.186l-.458.459c-.314.314-.641.616-.979.904v3.207c0 .608-.315 1.172-.833 1.49l-2.774 1.707a.749.749 0 0 1-1.11-.418l-.954-3.102a1.214 1.214 0 0 1-.145-.125L3.754 9.816a1.218 1.218 0 0 1-.124-.145L.528 8.717a.749.749 0 0 1-.418-1.11l1.71-2.774A1.748 1.748 0 0 1 3.31 4h3.204c.288-.338.59-.665.904-.979l.459-.458A8.749 8.749 0 0 1 14.064 0ZM8.938 3.623h-.002l-.458.458c-.76.76-1.437 1.598-2.02 2.5l-1.5 2.317 2.143 2.143 2.317-1.5c.902-.583 1.74-1.26 2.499-2.02l.459-.458a7.25 7.25 0 0 0 2.123-5.127V1.75a.25.25 0 0 0-.25-.25h-.186a7.249 7.249 0 0 0-5.125 2.123ZM3.56 14.56c-.732.732-2.334 1.045-3.005 1.148a.234.234 0 0 1-.201-.064.234.234 0 0 1-.064-.201c.103-.671.416-2.273 1.15-3.003a1.502 1.502 0 1 1 2.12 2.12Zm6.94-3.935c-.088.06-.177.118-.266.175l-2.35 1.521.548 1.783 1.949-1.2a.25.25 0 0 0 .119-.213ZM3.678 8.116 5.2 5.766c.058-.09.117-.178.176-.266H3.309a.25.25 0 0 0-.213.119l-1.2 1.95ZM12 5a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
</template>

<template id="shield-check-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-shield-check">
    <path d="m8.533.133 5.25 1.68A1.75 1.75 0 0 1 15 3.48V7c0 1.566-.32 3.182-1.303 4.682-.983 1.498-2.585 2.813-5.032 3.855a1.697 1.697 0 0 1-1.33 0c-2.447-1.042-4.049-2.357-5.032-3.855C1.32 10.182 1 8.566 1 7V3.48a1.75 1.75 0 0 1 1.217-1.667l5.25-1.68a1.748 1.748 0 0 1 1.066 0Zm-.61 1.429.001.001-5.25 1.68a.251.251 0 0 0-.174.237V7c0 1.36.275 2.666 1.057 3.859.784 1.194 2.121 2.342 4.366 3.298a.196.196 0 0 0 .154 0c2.245-.957 3.582-2.103 4.366-3.297C13.225 9.666 13.5 8.358 13.5 7V3.48a.25.25 0 0 0-.174-.238l-5.25-1.68a.25.25 0 0 0-.153 0ZM11.28 6.28l-3.5 3.5a.75.75 0 0 1-1.06 0l-1.5-1.5a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215l.97.97 2.97-2.97a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path>
</svg>
</template>

<template id="heart-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-heart">
    <path d="m8 14.25.345.666a.75.75 0 0 1-.69 0l-.008-.004-.018-.01a7.152 7.152 0 0 1-.31-.17 22.055 22.055 0 0 1-3.434-2.414C2.045 10.731 0 8.35 0 5.5 0 2.836 2.086 1 4.25 1 5.797 1 7.153 1.802 8 3.02 8.847 1.802 10.203 1 11.75 1 13.914 1 16 2.836 16 5.5c0 2.85-2.045 5.231-3.885 6.818a22.066 22.066 0 0 1-3.744 2.584l-.018.01-.006.003h-.002ZM4.25 2.5c-1.336 0-2.75 1.164-2.75 3 0 2.15 1.58 4.144 3.365 5.682A20.58 20.58 0 0 0 8 13.393a20.58 20.58 0 0 0 3.135-2.211C12.92 9.644 14.5 7.65 14.5 5.5c0-1.836-1.414-3-2.75-3-1.373 0-2.609.986-3.029 2.456a.749.749 0 0 1-1.442 0C6.859 3.486 5.623 2.5 4.25 2.5Z"></path>
</svg>
</template>

<template id="server-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-server">
    <path d="M1.75 1h12.5c.966 0 1.75.784 1.75 1.75v4c0 .372-.116.717-.314 1 .198.283.314.628.314 1v4a1.75 1.75 0 0 1-1.75 1.75H1.75A1.75 1.75 0 0 1 0 12.75v-4c0-.358.109-.707.314-1a1.739 1.739 0 0 1-.314-1v-4C0 1.784.784 1 1.75 1ZM1.5 2.75v4c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-4a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25Zm.25 5.75a.25.25 0 0 0-.25.25v4c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-4a.25.25 0 0 0-.25-.25ZM7 4.75A.75.75 0 0 1 7.75 4h4.5a.75.75 0 0 1 0 1.5h-4.5A.75.75 0 0 1 7 4.75ZM7.75 10h4.5a.75.75 0 0 1 0 1.5h-4.5a.75.75 0 0 1 0-1.5ZM3 4.75A.75.75 0 0 1 3.75 4h.5a.75.75 0 0 1 0 1.5h-.5A.75.75 0 0 1 3 4.75ZM3.75 10h.5a.75.75 0 0 1 0 1.5h-.5a.75.75 0 0 1 0-1.5Z"></path>
</svg>
</template>

<template id="globe-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-globe">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM5.78 8.75a9.64 9.64 0 0 0 1.363 4.177c.255.426.542.832.857 1.215.245-.296.551-.705.857-1.215A9.64 9.64 0 0 0 10.22 8.75Zm4.44-1.5a9.64 9.64 0 0 0-1.363-4.177c-.307-.51-.612-.919-.857-1.215a9.927 9.927 0 0 0-.857 1.215A9.64 9.64 0 0 0 5.78 7.25Zm-5.944 1.5H1.543a6.507 6.507 0 0 0 4.666 5.5c-.123-.181-.24-.365-.352-.552-.715-1.192-1.437-2.874-1.581-4.948Zm-2.733-1.5h2.733c.144-2.074.866-3.756 1.58-4.948.12-.197.237-.381.353-.552a6.507 6.507 0 0 0-4.666 5.5Zm10.181 1.5c-.144 2.074-.866 3.756-1.58 4.948-.12.197-.237.381-.353.552a6.507 6.507 0 0 0 4.666-5.5Zm2.733-1.5a6.507 6.507 0 0 0-4.666-5.5c.123.181.24.365.353.552.714 1.192 1.436 2.874 1.58 4.948Z"></path>
</svg>
</template>

<template id="issue-opened-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-issue-opened">
    <path d="M8 9.5a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path><path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Z"></path>
</svg>
</template>

<template id="device-mobile-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-device-mobile">
    <path d="M3.75 0h8.5C13.216 0 14 .784 14 1.75v12.5A1.75 1.75 0 0 1 12.25 16h-8.5A1.75 1.75 0 0 1 2 14.25V1.75C2 .784 2.784 0 3.75 0ZM3.5 1.75v12.5c0 .138.112.25.25.25h8.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25h-8.5a.25.25 0 0 0-.25.25ZM8 13a1 1 0 1 1 0-2 1 1 0 0 1 0 2Z"></path>
</svg>
</template>

<template id="package-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-package">
    <path d="m8.878.392 5.25 3.045c.54.314.872.89.872 1.514v6.098a1.75 1.75 0 0 1-.872 1.514l-5.25 3.045a1.75 1.75 0 0 1-1.756 0l-5.25-3.045A1.75 1.75 0 0 1 1 11.049V4.951c0-.624.332-1.201.872-1.514L7.122.392a1.75 1.75 0 0 1 1.756 0ZM7.875 1.69l-4.63 2.685L8 7.133l4.755-2.758-4.63-2.685a.248.248 0 0 0-.25 0ZM2.5 5.677v5.372c0 .09.047.171.125.216l4.625 2.683V8.432Zm6.25 8.271 4.625-2.683a.25.25 0 0 0 .125-.216V5.677L8.75 8.432Z"></path>
</svg>
</template>

<template id="credit-card-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-credit-card">
    <path d="M10.75 9a.75.75 0 0 0 0 1.5h1.5a.75.75 0 0 0 0-1.5h-1.5Z"></path><path d="M0 3.75C0 2.784.784 2 1.75 2h12.5c.966 0 1.75.784 1.75 1.75v8.5A1.75 1.75 0 0 1 14.25 14H1.75A1.75 1.75 0 0 1 0 12.25ZM14.5 6.5h-13v5.75c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25Zm0-2.75a.25.25 0 0 0-.25-.25H1.75a.25.25 0 0 0-.25.25V5h13Z"></path>
</svg>
</template>

<template id="play-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-play">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Zm4.879-2.773 4.264 2.559a.25.25 0 0 1 0 .428l-4.264 2.559A.25.25 0 0 1 6 10.559V5.442a.25.25 0 0 1 .379-.215Z"></path>
</svg>
</template>

<template id="gift-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-gift">
    <path d="M2 2.75A2.75 2.75 0 0 1 4.75 0c.983 0 1.873.42 2.57 1.232.268.318.497.668.68 1.042.183-.375.411-.725.68-1.044C9.376.42 10.266 0 11.25 0a2.75 2.75 0 0 1 2.45 4h.55c.966 0 1.75.784 1.75 1.75v2c0 .698-.409 1.301-1 1.582v4.918A1.75 1.75 0 0 1 13.25 16H2.75A1.75 1.75 0 0 1 1 14.25V9.332C.409 9.05 0 8.448 0 7.75v-2C0 4.784.784 4 1.75 4h.55c-.192-.375-.3-.8-.3-1.25ZM7.25 9.5H2.5v4.75c0 .138.112.25.25.25h4.5Zm1.5 0v5h4.5a.25.25 0 0 0 .25-.25V9.5Zm0-4V8h5.5a.25.25 0 0 0 .25-.25v-2a.25.25 0 0 0-.25-.25Zm-7 0a.25.25 0 0 0-.25.25v2c0 .138.112.25.25.25h5.5V5.5h-5.5Zm3-4a1.25 1.25 0 0 0 0 2.5h2.309c-.233-.818-.542-1.401-.878-1.793-.43-.502-.915-.707-1.431-.707ZM8.941 4h2.309a1.25 1.25 0 0 0 0-2.5c-.516 0-1 .205-1.43.707-.337.392-.646.975-.879 1.793Z"></path>
</svg>
</template>

<template id="code-square-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code-square">
    <path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25Zm7.47 3.97a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 2a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L10.69 8 9.22 6.53a.75.75 0 0 1 0-1.06ZM6.78 6.53 5.31 8l1.47 1.47a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215l-2-2a.75.75 0 0 1 0-1.06l2-2a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path>
</svg>
</template>

<template id="device-desktop-icon">
  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-device-desktop">
    <path d="M14.25 1c.966 0 1.75.784 1.75 1.75v7.5A1.75 1.75 0 0 1 14.25 12h-3.727c.099 1.041.52 1.872 1.292 2.757A.752.752 0 0 1 11.25 16h-6.5a.75.75 0 0 1-.565-1.243c.772-.885 1.192-1.716 1.292-2.757H1.75A1.75 1.75 0 0 1 0 10.25v-7.5C0 1.784.784 1 1.75 1ZM1.75 2.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25ZM9.018 12H6.982a5.72 5.72 0 0 1-.765 2.5h3.566a5.72 5.72 0 0 1-.765-2.5Z"></path>
</svg>
</template>

        <div class="position-relative">
                <ul
                  role="listbox"
                  class="ActionListWrap QueryBuilder-ListWrap"
                  aria-label="Suggestions"
                  data-action="
                    combobox-commit:query-builder#comboboxCommit
                    mousedown:query-builder#resultsMousedown
                  "
                  data-target="query-builder.resultsList"
                  data-persist-list=false
                  id="query-builder-test-results"
                ></ul>
        </div>
      <div class="FormControl-inlineValidation" id="validation-d9842016-babf-4885-ab02-96c584c5e287" hidden="hidden">
        <span class="FormControl-inlineValidation--visual">
          <svg aria-hidden="true" height="12" viewBox="0 0 12 12" version="1.1" width="12" data-view-component="true" class="octicon octicon-alert-fill">
    <path d="M4.855.708c.5-.896 1.79-.896 2.29 0l4.675 8.351a1.312 1.312 0 0 1-1.146 1.954H1.33A1.313 1.313 0 0 1 .183 9.058ZM7 7V3H5v4Zm-1 3a1 1 0 1 0 0-2 1 1 0 0 0 0 2Z"></path>
</svg>
        </span>
        <span></span>
</div>    </div>
    <div data-target="query-builder.screenReaderFeedback" aria-live="polite" aria-atomic="true" class="sr-only"></div>
</query-builder></form>
          <div class="d-flex flex-row color-fg-muted px-3 text-small color-bg-default search-feedback-prompt">
            <a target="_blank" href="https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax" data-view-component="true" class="Link color-fg-accent text-normal ml-2">Search syntax tips</a>            <div class="d-flex flex-1"></div>
              <button data-action="click:qbsearch-input#showFeedbackDialog" type="button" data-view-component="true" class="Button--link Button--medium Button color-fg-accent text-normal ml-2">  <span class="Button-content">
    <span class="Button-label">Give feedback</span>
  </span>
</button>
          </div>
        </div>
</div>

    </div>
</modal-dialog></div>
  </div>
  <div data-action="click:qbsearch-input#retract" class="dark-backdrop position-fixed" hidden data-target="qbsearch-input.darkBackdrop"></div>
  <div class="color-fg-default">
    
<dialog-helper>
  <dialog data-target="qbsearch-input.feedbackDialog" data-action="close:qbsearch-input#handleDialogClose cancel:qbsearch-input#handleDialogClose" id="feedback-dialog" aria-modal="true" aria-labelledby="feedback-dialog-title" aria-describedby="feedback-dialog-description" data-view-component="true" class="Overlay Overlay-whenNarrow Overlay--size-medium Overlay--motion-scaleFade Overlay--disableScroll">
    <div data-view-component="true" class="Overlay-header">
  <div class="Overlay-headerContentWrap">
    <div class="Overlay-titleWrap">
      <h1 class="Overlay-title " id="feedback-dialog-title">
        Provide feedback
      </h1>
        
    </div>
    <div class="Overlay-actionWrap">
      <button data-close-dialog-id="feedback-dialog" aria-label="Close" type="button" data-view-component="true" class="close-button Overlay-closeButton"><svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg></button>
    </div>
  </div>
  
</div>
      <scrollable-region data-labelled-by="feedback-dialog-title">
        <div data-view-component="true" class="Overlay-body">        <!-- '"` --><!-- </textarea></xmp> --></option></form><form id="code-search-feedback-form" data-turbo="false" action="/search/feedback" accept-charset="UTF-8" method="post"><input type="hidden" name="authenticity_token" value="ZTpqdXLFfpkJrBjOSS9lfojp8rn-KTyQTEd_-ZizBkbxGBfhStEMP65Qqzpcp8CrgrLzWbatqLgLiTUdLy-Izw" />
          <p>We read every piece of feedback, and take your input very seriously.</p>
          <textarea name="feedback" class="form-control width-full mb-2" style="height: 120px" id="feedback"></textarea>
          <input name="include_email" id="include_email" aria-label="Include my email address so I can be contacted" class="form-control mr-2" type="checkbox">
          <label for="include_email" style="font-weight: normal">Include my email address so I can be contacted</label>
</form></div>
      </scrollable-region>
      <div data-view-component="true" class="Overlay-footer Overlay-footer--alignEnd">          <button data-close-dialog-id="feedback-dialog" type="button" data-view-component="true" class="btn">    Cancel
</button>
          <button form="code-search-feedback-form" data-action="click:qbsearch-input#submitFeedback" type="submit" data-view-component="true" class="btn-primary btn">    Submit feedback
</button>
</div>
</dialog></dialog-helper>

    <custom-scopes data-target="qbsearch-input.customScopesManager">
    
<dialog-helper>
  <dialog data-target="custom-scopes.customScopesModalDialog" data-action="close:qbsearch-input#handleDialogClose cancel:qbsearch-input#handleDialogClose" id="custom-scopes-dialog" aria-modal="true" aria-labelledby="custom-scopes-dialog-title" aria-describedby="custom-scopes-dialog-description" data-view-component="true" class="Overlay Overlay-whenNarrow Overlay--size-medium Overlay--motion-scaleFade Overlay--disableScroll">
    <div data-view-component="true" class="Overlay-header Overlay-header--divided">
  <div class="Overlay-headerContentWrap">
    <div class="Overlay-titleWrap">
      <h1 class="Overlay-title " id="custom-scopes-dialog-title">
        Saved searches
      </h1>
        <h2 id="custom-scopes-dialog-description" class="Overlay-description">Use saved searches to filter your results more quickly</h2>
    </div>
    <div class="Overlay-actionWrap">
      <button data-close-dialog-id="custom-scopes-dialog" aria-label="Close" type="button" data-view-component="true" class="close-button Overlay-closeButton"><svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg></button>
    </div>
  </div>
  
</div>
      <scrollable-region data-labelled-by="custom-scopes-dialog-title">
        <div data-view-component="true" class="Overlay-body">        <div data-target="custom-scopes.customScopesModalDialogFlash"></div>

        <div hidden class="create-custom-scope-form" data-target="custom-scopes.createCustomScopeForm">
        <!-- '"` --><!-- </textarea></xmp> --></option></form><form id="custom-scopes-dialog-form" data-turbo="false" action="/search/custom_scopes" accept-charset="UTF-8" method="post"><input type="hidden" name="authenticity_token" value="SXBkbsF5E9vmK3WWTsbUxkrZi2_4L_TKJkpjKL2T8CG0GsgqpLMUoCX-BVJRhpRS4s-Hzndg1GOD9bJFl9zkWQ" />
          <div data-target="custom-scopes.customScopesModalDialogFlash"></div>

          <input type="hidden" id="custom_scope_id" name="custom_scope_id" data-target="custom-scopes.customScopesIdField">

          <div class="form-group">
            <label for="custom_scope_name">Name</label>
            <auto-check src="/search/custom_scopes/check_name" required only-validate-on-blur="false">
              <input
                type="text"
                name="custom_scope_name"
                id="custom_scope_name"
                data-target="custom-scopes.customScopesNameField"
                class="form-control"
                autocomplete="off"
                placeholder="github-ruby"
                required
                maxlength="50">
              <input type="hidden" value="UeNph_pvhwsQUrYnhK6yIc6X4j_9pyzGcEAdVWpcVElJLXWenI1twHWgXXiu5714PE7tOTmtNZu_ULML2OZ8dA" data-csrf="true" />
            </auto-check>
          </div>

          <div class="form-group">
            <label for="custom_scope_query">Query</label>
            <input
              type="text"
              name="custom_scope_query"
              id="custom_scope_query"
              data-target="custom-scopes.customScopesQueryField"
              class="form-control"
              autocomplete="off"
              placeholder="(repo:mona/a OR repo:mona/b) AND lang:python"
              required
              maxlength="500">
          </div>

          <p class="text-small color-fg-muted">
            To see all available qualifiers, see our <a class="Link--inTextBlock" href="https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax">documentation</a>.
          </p>
</form>        </div>

        <div data-target="custom-scopes.manageCustomScopesForm">
          <div data-target="custom-scopes.list"></div>
        </div>

</div>
      </scrollable-region>
      <div data-view-component="true" class="Overlay-footer Overlay-footer--alignEnd Overlay-footer--divided">          <button data-action="click:custom-scopes#customScopesCancel" type="button" data-view-component="true" class="btn">    Cancel
</button>
          <button form="custom-scopes-dialog-form" data-action="click:custom-scopes#customScopesSubmit" data-target="custom-scopes.customScopesSubmitButton" type="submit" data-view-component="true" class="btn-primary btn">    Create saved search
</button>
</div>
</dialog></dialog-helper>
    </custom-scopes>
  </div>
</qbsearch-input>  <input type="hidden" value="79O93enXNNAI0qKarZtL7YvaPXY__VK5vcLjdbL82TUBiUNHmspXI5YUaAI38KOTMKxmNOgKqeS8FI5qzmTm1A" data-csrf="true" class="js-data-jump-to-suggestions-path-csrf" />


          </div>

        
          <div class="AppHeader-CopilotChat">
    <react-partial-anchor>
      <button id="copilot-chat-header-button" data-target="react-partial-anchor.anchor" data-hotkey="Shift+C" aria-expanded="false" aria-controls="copilot-chat-panel" aria-labelledby="tooltip-76ff6244-0180-4501-884c-3b7667150ca2" type="button" disabled="disabled" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button AppHeader-buttonLeft cursor-wait">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copilot Button-visual">
    <path d="M7.998 15.035c-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.201-.508-.254-1.084-.254-1.656 0-.87.128-1.769.693-2.484.579-.733 1.494-1.124 2.724-1.261 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095v1.872c0 .766-3.351 3.795-8.002 3.795Zm0-1.485c2.28 0 4.584-1.11 5.002-1.433V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-1.146 0-2.059-.327-2.71-.991A3.222 3.222 0 0 1 8 6.303a3.24 3.24 0 0 1-.544.743c-.65.664-1.563.991-2.71.991-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433ZM6.762 2.83c-.193-.206-.637-.413-1.682-.297-1.019.113-1.479.404-1.713.7-.247.312-.369.789-.369 1.554 0 .793.129 1.171.308 1.371.162.181.519.379 1.442.379.853 0 1.339-.235 1.638-.54.315-.322.527-.827.617-1.553.117-.935-.037-1.395-.241-1.614Zm4.155-.297c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Z"></path><path d="M6.25 9.037a.75.75 0 0 1 .75.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 .75-.75Zm4.25.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 1.5 0Z"></path>
</svg>
</button><tool-tip id="tooltip-76ff6244-0180-4501-884c-3b7667150ca2" for="copilot-chat-header-button" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Chat with Copilot</tool-tip>

      <template data-target="react-partial-anchor.template">
        <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_react-relay_index_js-3e4c69718bad.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_tanstack_query-core_build_modern_queryObserver_js-node_modules_tanstack_-defd52-843b41414e0e.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_micromark-util-sanitize-uri_index_js-node_modules_remark-parse_lib_index-b69642-163efad98dc5.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_remark-gfm_lib_index_js-bfb9e2c9eabe.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_react-markdown_lib_index_js-2816acae350e.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_tanstack_react-query_build_modern_useQuery_js-node_modules_hast-util-fin-d142e3-fe0e76a2e3fe.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_github_mini-throttle_dist_decorators_js-node_modules_accname_dist_access-b37425-35bd8d94d981.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/vendors-node_modules_github_combobox-nav_dist_index_js-node_modules_github_hotkey_dist_index_-2c4211-a3b6ffd98cc6.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_item-picker_constants_labels_ts-ui_packages_item-picker_constants_values_ts-ui_pa-163a9a-ee6b1c4387f2.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_item-picker_components_RepositoryPicker_tsx-fed97f53635f.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_copilot-chat_utils_copilot-local-storage_ts-ui_packages_hydro-analytics_hydro-ana-74ad7c-cd6ac89814da.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_copilot-chat_utils_copilot-chat-hooks_ts-ui_packages_issue-viewer_utils_queries_ts-8a23643c08a1.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_test-id-props_test-id-props_ts-ui_packages_copilot-markdown_MarkdownRenderer_tsx--cd0d45-16709ea47eec.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/copilot-chat-07129b2860fa.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/copilot-chat.4e64150ee8c92ed63ef0.module.css" />
        <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/copilot-markdown-rendering-f6845e8f5d6b.css" />
        <include-fragment src="/github-copilot/chat?skip_anchor=true"></include-fragment>
      </template>
    </react-partial-anchor>
    <react-partial-anchor>
      <button id="global-copilot-menu-button" data-target="react-partial-anchor.anchor" aria-expanded="false" aria-labelledby="tooltip-5ef2b7ac-6993-4818-becb-564c5a7ff6fe" type="button" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button AppHeader-buttonRight">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-triangle-down Button-visual">
    <path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path>
</svg>
</button><tool-tip id="tooltip-5ef2b7ac-6993-4818-becb-564c5a7ff6fe" for="global-copilot-menu-button" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Open Copilot…</tool-tip>

      <template data-target="react-partial-anchor.template">
        <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/global-copilot-menu-f997b4b96fc7.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />

<react-partial
  partial-name="global-copilot-menu"
  data-ssr="false"
  data-attempted-ssr="false"
>
  
  <script type="application/json" data-target="react-partial.embeddedData">{"props":{}}</script>
  <div data-target="react-partial.reactRoot"></div>
</react-partial>

      </template>
    </react-partial-anchor>
</div>


        <div class="AppHeader-actions position-relative">
             <react-partial-anchor>
      <button id="global-create-menu-anchor" aria-label="Create something new" data-target="react-partial-anchor.anchor" type="button" disabled="disabled" data-view-component="true" class="AppHeader-button global-create-button cursor-wait Button--secondary Button--medium Button width-auto color-fg-muted">  <span class="Button-content">
      <span class="Button-visual Button-leadingVisual">
        <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-plus">
    <path d="M7.75 2a.75.75 0 0 1 .75.75V7h4.25a.75.75 0 0 1 0 1.5H8.5v4.25a.75.75 0 0 1-1.5 0V8.5H2.75a.75.75 0 0 1 0-1.5H7V2.75A.75.75 0 0 1 7.75 2Z"></path>
</svg>
      </span>
    <span class="Button-label"><svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-triangle-down">
    <path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path>
</svg></span>
  </span>
</button><tool-tip id="tooltip-ed5feb45-0058-4d48-889f-fb3ac351e82f" for="global-create-menu-anchor" popover="manual" data-direction="s" data-type="description" data-view-component="true" class="sr-only position-absolute">Create new...</tool-tip>

      <template data-target="react-partial-anchor.template">
        <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/ui_packages_promise-with-resolvers-polyfill_promise-with-resolvers-polyfill_ts-ui_packages_re-8d43b0-ae8dde838777.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/global-create-menu-7510a0ee7657.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />

<react-partial
  partial-name="global-create-menu"
  data-ssr="false"
  data-attempted-ssr="false"
>
  
  <script type="application/json" data-target="react-partial.embeddedData">{"props":{"createRepo":true,"importRepo":true,"codespaces":true,"gist":true,"createOrg":true,"createProject":false,"createProjectUrl":"/DominiqueLoyer?tab=projects","createLegacyProject":false,"createIssue":false,"org":null,"owner":"Alimiji","repo":"Solr_utilisation"}}</script>
  <div data-target="react-partial.reactRoot"></div>
</react-partial>

      </template>
    </react-partial-anchor>


          <a href="/issues" data-analytics-event="{&quot;category&quot;:&quot;Global navigation&quot;,&quot;action&quot;:&quot;ISSUES_HEADER&quot;,&quot;label&quot;:null}" id="icon-button-09e73c72-2cbc-4558-a2bf-cf207d9ca0cb" aria-labelledby="tooltip-9578566a-4e4f-4396-aa33-40d1d3b0cabc" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-issue-opened Button-visual">
    <path d="M8 9.5a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path><path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Z"></path>
</svg>
</a><tool-tip id="tooltip-9578566a-4e4f-4396-aa33-40d1d3b0cabc" for="icon-button-09e73c72-2cbc-4558-a2bf-cf207d9ca0cb" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Your issues</tool-tip>

          <a href="/pulls" data-analytics-event="{&quot;category&quot;:&quot;Global navigation&quot;,&quot;action&quot;:&quot;PULL_REQUESTS_HEADER&quot;,&quot;label&quot;:null}" id="icon-button-4c1d565c-b40d-471a-a91c-c8d37024cb02" aria-labelledby="tooltip-33a1455f-d205-4aeb-82d8-ca553d989408" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-git-pull-request Button-visual">
    <path d="M1.5 3.25a2.25 2.25 0 1 1 3 2.122v5.256a2.251 2.251 0 1 1-1.5 0V5.372A2.25 2.25 0 0 1 1.5 3.25Zm5.677-.177L9.573.677A.25.25 0 0 1 10 .854V2.5h1A2.5 2.5 0 0 1 13.5 5v5.628a2.251 2.251 0 1 1-1.5 0V5a1 1 0 0 0-1-1h-1v1.646a.25.25 0 0 1-.427.177L7.177 3.427a.25.25 0 0 1 0-.354ZM3.75 2.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm0 9.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm8.25.75a.75.75 0 1 0 1.5 0 .75.75 0 0 0-1.5 0Z"></path>
</svg>
</a><tool-tip id="tooltip-33a1455f-d205-4aeb-82d8-ca553d989408" for="icon-button-4c1d565c-b40d-471a-a91c-c8d37024cb02" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Your pull requests</tool-tip>

        </div>

        <notification-indicator data-channel="eyJjIjoibm90aWZpY2F0aW9uLWNoYW5nZWQ6MTA1MjI0OTIiLCJ0IjoxNzQxMjAwNjc1fQ==--0901f66d43c8d0ce4cd5ca93a0af6dabd04455e6ed4ab96f83841cc0cdfbce41" data-indicator-mode="none" data-tooltip-global="You have unread notifications" data-tooltip-unavailable="Notifications are unavailable at the moment." data-tooltip-none="You have no unread notifications" data-header-redesign-enabled="true" data-fetch-indicator-src="/notifications/indicator" data-fetch-indicator-enabled="true" data-view-component="true" class="js-socket-channel">
    <a id="AppHeader-notifications-button" href="/notifications" aria-labelledby="notification-indicator-tooltip" data-hotkey="g n" data-target="notification-indicator.link" data-analytics-event="{&quot;category&quot;:&quot;Global navigation&quot;,&quot;action&quot;:&quot;NOTIFICATIONS_HEADER&quot;,&quot;label&quot;:null}" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium AppHeader-button  color-fg-muted">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-inbox Button-visual">
    <path d="M2.8 2.06A1.75 1.75 0 0 1 4.41 1h7.18c.7 0 1.333.417 1.61 1.06l2.74 6.395c.04.093.06.194.06.295v4.5A1.75 1.75 0 0 1 14.25 15H1.75A1.75 1.75 0 0 1 0 13.25v-4.5c0-.101.02-.202.06-.295Zm1.61.44a.25.25 0 0 0-.23.152L1.887 8H4.75a.75.75 0 0 1 .6.3L6.625 10h2.75l1.275-1.7a.75.75 0 0 1 .6-.3h2.863L11.82 2.652a.25.25 0 0 0-.23-.152Zm10.09 7h-2.875l-1.275 1.7a.75.75 0 0 1-.6.3h-3.5a.75.75 0 0 1-.6-.3L4.375 9.5H1.5v3.75c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25Z"></path>
</svg>
</a>

    <tool-tip id="notification-indicator-tooltip" data-target="notification-indicator.tooltip" for="AppHeader-notifications-button" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Notifications</tool-tip>
</notification-indicator>

        <div class="AppHeader-user">
          <deferred-side-panel data-url="/_side-panels/user?repository_id=892964695">
  <include-fragment data-target="deferred-side-panel.fragment">
    <react-partial-anchor
  
>
  <button data-target="react-partial-anchor.anchor" data-login="DominiqueLoyer" aria-label="Open user navigation menu" type="button" data-view-component="true" class="cursor-wait Button--invisible Button--medium Button Button--invisible-noVisuals color-bg-transparent p-0">  <span class="Button-content">
    <span class="Button-label"><img src="https://avatars.githubusercontent.com/u/10522492?v=4" alt="" size="32" height="32" width="32" data-view-component="true" class="avatar circle" /></span>
  </span>
</button>
  <template data-target="react-partial-anchor.template">
    <script crossorigin="anonymous" defer="defer" type="application/javascript" src="https://github.githubassets.com/assets/global-user-nav-drawer-487d63bb6986.js"></script>
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/primer-react.23dd435fe36097326ec3.module.css" />
<link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/global-user-nav-drawer.830d6c10c9fea7fc134e.module.css" />

<react-partial
  partial-name="global-user-nav-drawer"
  data-ssr="false"
  data-attempted-ssr="false"
>
  
  <script type="application/json" data-target="react-partial.embeddedData">{"props":{"owner":{"login":"DominiqueLoyer","name":"D   Ф  m  i  И  i  q   ц  e L  Ф   y   e   r","avatarUrl":"https://avatars.githubusercontent.com/u/10522492?v=4"},"drawerId":"global-user-nav-drawer","lazyLoadItemDataFetchUrl":"/_side-panels/user.json","canAddAccount":true,"addAccountPath":"/login?add_account=1\u0026return_to=https%3A%2F%2Fgithub.com%2FAlimiji%2FSolr_utilisation%2Fblob%2Fmain%2Fextraire_fichiers.py","switchAccountPath":"/switch_account","loginAccountPath":"/login?add_account=1","projectsPath":"/DominiqueLoyer?tab=projects","gistsUrl":"https://gist.github.com/mine","docsUrl":"https://docs.github.com","yourEnterpriseUrl":null,"enterpriseSettingsUrl":null,"supportUrl":"https://support.github.com","showAccountSwitcher":true,"showCopilot":true,"showEnterprises":true,"showEnterprise":false,"showGists":true,"showOrganizations":true,"showSponsors":true,"showUpgrade":true,"showFeaturesPreviews":true,"showEnterpriseSettings":false,"createMenuProps":{"createRepo":true,"importRepo":true,"codespaces":true,"gist":true,"createOrg":true,"createProject":false,"createProjectUrl":"/DominiqueLoyer?tab=projects","createLegacyProject":false,"createIssue":false,"org":null,"owner":"Alimiji","repo":"Solr_utilisation"}}}</script>
  <div data-target="react-partial.reactRoot"></div>
</react-partial>

  </template>
</react-partial-anchor>

  </include-fragment>
</deferred-side-panel>
        </div>

        <div class="position-absolute mt-2">
            
<site-header-logged-in-user-menu>

</site-header-logged-in-user-menu>

        </div>
      </div>
    </div>


    
        <div class="AppHeader-localBar" >
          <nav data-pjax="#js-repo-pjax-container" aria-label="Repository" data-view-component="true" class="js-repo-nav js-sidenav-container-pjax js-responsive-underlinenav overflow-hidden UnderlineNav">

  <ul data-view-component="true" class="UnderlineNav-body list-style-none">
      <li data-view-component="true" class="d-inline-flex">
  <a id="code-tab" href="/Alimiji/Solr_utilisation" data-tab-item="i0code-tab" data-selected-links="repo_source repo_downloads repo_commits repo_releases repo_tags repo_branches repo_packages repo_deployments repo_attestations /Alimiji/Solr_utilisation" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-hotkey="g c" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Code&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code UnderlineNav-octicon d-none d-sm-inline">
    <path d="m11.28 3.22 4.25 4.25a.75.75 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L13.94 8l-3.72-3.72a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215Zm-6.56 0a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042L2.06 8l3.72 3.72a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L.47 8.53a.75.75 0 0 1 0-1.06Z"></path>
</svg>
        <span data-content="Code">Code</span>
          <span id="code-repo-tab-count" data-pjax-replace="" data-turbo-replace="" title="Not available" data-view-component="true" class="Counter"></span>


    
</a></li>
      <li data-view-component="true" class="d-inline-flex">
  <a id="issues-tab" href="/Alimiji/Solr_utilisation/issues" data-tab-item="i1issues-tab" data-selected-links="repo_issues repo_labels repo_milestones /Alimiji/Solr_utilisation/issues" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-hotkey="g i" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Issues&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-issue-opened UnderlineNav-octicon d-none d-sm-inline">
    <path d="M8 9.5a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path><path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Z"></path>
</svg>
        <span data-content="Issues">Issues</span>
          <span id="issues-repo-tab-count" data-pjax-replace="" data-turbo-replace="" title="0" hidden="hidden" data-view-component="true" class="Counter">0</span>


    
</a></li>
      <li data-view-component="true" class="d-inline-flex">
  <a id="pull-requests-tab" href="/Alimiji/Solr_utilisation/pulls" data-tab-item="i2pull-requests-tab" data-selected-links="repo_pulls checks /Alimiji/Solr_utilisation/pulls" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-hotkey="g p" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Pull requests&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-git-pull-request UnderlineNav-octicon d-none d-sm-inline">
    <path d="M1.5 3.25a2.25 2.25 0 1 1 3 2.122v5.256a2.251 2.251 0 1 1-1.5 0V5.372A2.25 2.25 0 0 1 1.5 3.25Zm5.677-.177L9.573.677A.25.25 0 0 1 10 .854V2.5h1A2.5 2.5 0 0 1 13.5 5v5.628a2.251 2.251 0 1 1-1.5 0V5a1 1 0 0 0-1-1h-1v1.646a.25.25 0 0 1-.427.177L7.177 3.427a.25.25 0 0 1 0-.354ZM3.75 2.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm0 9.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm8.25.75a.75.75 0 1 0 1.5 0 .75.75 0 0 0-1.5 0Z"></path>
</svg>
        <span data-content="Pull requests">Pull requests</span>
          <span id="pull-requests-repo-tab-count" data-pjax-replace="" data-turbo-replace="" title="0" hidden="hidden" data-view-component="true" class="Counter">0</span>


    
</a></li>
      <li data-view-component="true" class="d-inline-flex">
  <a id="actions-tab" href="/Alimiji/Solr_utilisation/actions" data-tab-item="i3actions-tab" data-selected-links="repo_actions /Alimiji/Solr_utilisation/actions" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-hotkey="g a" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Actions&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-play UnderlineNav-octicon d-none d-sm-inline">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Zm4.879-2.773 4.264 2.559a.25.25 0 0 1 0 .428l-4.264 2.559A.25.25 0 0 1 6 10.559V5.442a.25.25 0 0 1 .379-.215Z"></path>
</svg>
        <span data-content="Actions">Actions</span>
          <span id="actions-repo-tab-count" data-pjax-replace="" data-turbo-replace="" title="Not available" data-view-component="true" class="Counter"></span>


    
</a></li>
      <li data-view-component="true" class="d-inline-flex">
  <a id="projects-tab" href="/Alimiji/Solr_utilisation/projects" data-tab-item="i4projects-tab" data-selected-links="repo_projects new_repo_project repo_project /Alimiji/Solr_utilisation/projects" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-hotkey="g b" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Projects&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-table UnderlineNav-octicon d-none d-sm-inline">
    <path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25ZM6.5 6.5v8h7.75a.25.25 0 0 0 .25-.25V6.5Zm8-1.5V1.75a.25.25 0 0 0-.25-.25H6.5V5Zm-13 1.5v7.75c0 .138.112.25.25.25H5v-8ZM5 5V1.5H1.75a.25.25 0 0 0-.25.25V5Z"></path>
</svg>
        <span data-content="Projects">Projects</span>
          <span id="projects-repo-tab-count" data-pjax-replace="" data-turbo-replace="" title="0" hidden="hidden" data-view-component="true" class="Counter">0</span>


    
</a></li>
      <li data-view-component="true" class="d-inline-flex">
  <a id="security-tab" href="/Alimiji/Solr_utilisation/security" data-tab-item="i5security-tab" data-selected-links="security overview alerts policy token_scanning code_scanning /Alimiji/Solr_utilisation/security" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-hotkey="g s" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Security&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-shield UnderlineNav-octicon d-none d-sm-inline">
    <path d="M7.467.133a1.748 1.748 0 0 1 1.066 0l5.25 1.68A1.75 1.75 0 0 1 15 3.48V7c0 1.566-.32 3.182-1.303 4.682-.983 1.498-2.585 2.813-5.032 3.855a1.697 1.697 0 0 1-1.33 0c-2.447-1.042-4.049-2.357-5.032-3.855C1.32 10.182 1 8.566 1 7V3.48a1.75 1.75 0 0 1 1.217-1.667Zm.61 1.429a.25.25 0 0 0-.153 0l-5.25 1.68a.25.25 0 0 0-.174.238V7c0 1.358.275 2.666 1.057 3.86.784 1.194 2.121 2.34 4.366 3.297a.196.196 0 0 0 .154 0c2.245-.956 3.582-2.104 4.366-3.298C13.225 9.666 13.5 8.36 13.5 7V3.48a.251.251 0 0 0-.174-.237l-5.25-1.68ZM8.75 4.75v3a.75.75 0 0 1-1.5 0v-3a.75.75 0 0 1 1.5 0ZM9 10.5a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
        <span data-content="Security">Security</span>
          <include-fragment src="/Alimiji/Solr_utilisation/security/overall-count" accept="text/fragment+html"></include-fragment>

    
</a></li>
      <li data-view-component="true" class="d-inline-flex">
  <a id="insights-tab" href="/Alimiji/Solr_utilisation/pulse" data-tab-item="i6insights-tab" data-selected-links="repo_graphs repo_contributors dependency_graph dependabot_updates pulse people community /Alimiji/Solr_utilisation/pulse" data-pjax="#repo-content-pjax-container" data-turbo-frame="repo-content-turbo-frame" data-analytics-event="{&quot;category&quot;:&quot;Underline navbar&quot;,&quot;action&quot;:&quot;Click tab&quot;,&quot;label&quot;:&quot;Insights&quot;,&quot;target&quot;:&quot;UNDERLINE_NAV.TAB&quot;}" data-view-component="true" class="UnderlineNav-item no-wrap js-responsive-underlinenav-item js-selected-navigation-item">
    
              <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-graph UnderlineNav-octicon d-none d-sm-inline">
    <path d="M1.5 1.75V13.5h13.75a.75.75 0 0 1 0 1.5H.75a.75.75 0 0 1-.75-.75V1.75a.75.75 0 0 1 1.5 0Zm14.28 2.53-5.25 5.25a.75.75 0 0 1-1.06 0L7 7.06 4.28 9.78a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042l3.25-3.25a.75.75 0 0 1 1.06 0L10 7.94l4.72-4.72a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path>
</svg>
        <span data-content="Insights">Insights</span>
          <span id="insights-repo-tab-count" data-pjax-replace="" data-turbo-replace="" title="Not available" data-view-component="true" class="Counter"></span>


    
</a></li>
</ul>
    <div style="visibility:hidden;" data-view-component="true" class="UnderlineNav-actions js-responsive-underlinenav-overflow position-absolute pr-3 pr-md-4 pr-lg-5 right-0">      <action-menu data-select-variant="none" data-view-component="true">
  <focus-group direction="vertical" mnemonics retain>
    <button id="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-button" popovertarget="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-overlay" aria-controls="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-list" aria-haspopup="true" aria-labelledby="tooltip-dadfd319-a6a7-4026-ba2c-925ac8df240a" type="button" data-view-component="true" class="Button Button--iconOnly Button--secondary Button--medium UnderlineNav-item">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-kebab-horizontal Button-visual">
    <path d="M8 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3ZM1.5 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Zm13 0a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path>
</svg>
</button><tool-tip id="tooltip-dadfd319-a6a7-4026-ba2c-925ac8df240a" for="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-button" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Additional navigation options</tool-tip>


<anchored-position data-target="action-menu.overlay" id="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-overlay" anchor="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-button" align="start" side="outside-bottom" anchor-offset="normal" popover="auto" data-view-component="true">
  <div data-view-component="true" class="Overlay Overlay--size-auto">
    
      <div data-view-component="true" class="Overlay-body Overlay-body--paddingNone">          <action-list>
  <div data-view-component="true">
    <ul aria-labelledby="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-button" id="action-menu-d3d15682-6fb4-4fed-b1e8-408eb58ef984-list" role="menu" data-view-component="true" class="ActionListWrap--inset ActionListWrap">
        <li hidden="hidden" data-menu-item="i0code-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-447cc7aa-1f1d-473b-8b22-dd49e5fdb4eb" href="/Alimiji/Solr_utilisation" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-code">
    <path d="m11.28 3.22 4.25 4.25a.75.75 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L13.94 8l-3.72-3.72a.749.749 0 0 1 .326-1.275.749.749 0 0 1 .734.215Zm-6.56 0a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042L2.06 8l3.72 3.72a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L.47 8.53a.75.75 0 0 1 0-1.06Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Code
</span>      
</a>
  
</li>
        <li hidden="hidden" data-menu-item="i1issues-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-6ce50039-f514-4487-9611-7e50fc0a2770" href="/Alimiji/Solr_utilisation/issues" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-issue-opened">
    <path d="M8 9.5a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path><path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Issues
</span>      
</a>
  
</li>
        <li hidden="hidden" data-menu-item="i2pull-requests-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-5331a6e8-a343-487c-86ce-cabff242c988" href="/Alimiji/Solr_utilisation/pulls" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-git-pull-request">
    <path d="M1.5 3.25a2.25 2.25 0 1 1 3 2.122v5.256a2.251 2.251 0 1 1-1.5 0V5.372A2.25 2.25 0 0 1 1.5 3.25Zm5.677-.177L9.573.677A.25.25 0 0 1 10 .854V2.5h1A2.5 2.5 0 0 1 13.5 5v5.628a2.251 2.251 0 1 1-1.5 0V5a1 1 0 0 0-1-1h-1v1.646a.25.25 0 0 1-.427.177L7.177 3.427a.25.25 0 0 1 0-.354ZM3.75 2.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm0 9.5a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Zm8.25.75a.75.75 0 1 0 1.5 0 .75.75 0 0 0-1.5 0Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Pull requests
</span>      
</a>
  
</li>
        <li hidden="hidden" data-menu-item="i3actions-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-e4b375d2-3534-4e26-a709-1a89e4dce0a4" href="/Alimiji/Solr_utilisation/actions" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-play">
    <path d="M8 0a8 8 0 1 1 0 16A8 8 0 0 1 8 0ZM1.5 8a6.5 6.5 0 1 0 13 0 6.5 6.5 0 0 0-13 0Zm4.879-2.773 4.264 2.559a.25.25 0 0 1 0 .428l-4.264 2.559A.25.25 0 0 1 6 10.559V5.442a.25.25 0 0 1 .379-.215Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Actions
</span>      
</a>
  
</li>
        <li hidden="hidden" data-menu-item="i4projects-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-2c57d4d6-8b45-47d6-b1b1-ccea2b3d0551" href="/Alimiji/Solr_utilisation/projects" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-table">
    <path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25ZM6.5 6.5v8h7.75a.25.25 0 0 0 .25-.25V6.5Zm8-1.5V1.75a.25.25 0 0 0-.25-.25H6.5V5Zm-13 1.5v7.75c0 .138.112.25.25.25H5v-8ZM5 5V1.5H1.75a.25.25 0 0 0-.25.25V5Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Projects
</span>      
</a>
  
</li>
        <li hidden="hidden" data-menu-item="i5security-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-7103d367-6e5a-4d9d-87ba-d22662d6b88c" href="/Alimiji/Solr_utilisation/security" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-shield">
    <path d="M7.467.133a1.748 1.748 0 0 1 1.066 0l5.25 1.68A1.75 1.75 0 0 1 15 3.48V7c0 1.566-.32 3.182-1.303 4.682-.983 1.498-2.585 2.813-5.032 3.855a1.697 1.697 0 0 1-1.33 0c-2.447-1.042-4.049-2.357-5.032-3.855C1.32 10.182 1 8.566 1 7V3.48a1.75 1.75 0 0 1 1.217-1.667Zm.61 1.429a.25.25 0 0 0-.153 0l-5.25 1.68a.25.25 0 0 0-.174.238V7c0 1.358.275 2.666 1.057 3.86.784 1.194 2.121 2.34 4.366 3.297a.196.196 0 0 0 .154 0c2.245-.956 3.582-2.104 4.366-3.298C13.225 9.666 13.5 8.36 13.5 7V3.48a.251.251 0 0 0-.174-.237l-5.25-1.68ZM8.75 4.75v3a.75.75 0 0 1-1.5 0v-3a.75.75 0 0 1 1.5 0ZM9 10.5a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Security
</span>      
</a>
  
</li>
        <li hidden="hidden" data-menu-item="i6insights-tab" data-targets="action-list.items" role="none" data-view-component="true" class="ActionListItem">
    
    
    <a tabindex="-1" id="item-f9ba8c15-48f3-4c6b-ad7a-49339f0de79b" href="/Alimiji/Solr_utilisation/pulse" role="menuitem" data-view-component="true" class="ActionListContent ActionListContent--visual16">
        <span class="ActionListItem-visual ActionListItem-visual--leading">
          <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-graph">
    <path d="M1.5 1.75V13.5h13.75a.75.75 0 0 1 0 1.5H.75a.75.75 0 0 1-.75-.75V1.75a.75.75 0 0 1 1.5 0Zm14.28 2.53-5.25 5.25a.75.75 0 0 1-1.06 0L7 7.06 4.28 9.78a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042l3.25-3.25a.75.75 0 0 1 1.06 0L10 7.94l4.72-4.72a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path>
</svg>
        </span>
      
        <span data-view-component="true" class="ActionListItem-label">
          Insights
</span>      
</a>
  
</li>
</ul>    
</div></action-list>


</div>
      
</div></anchored-position>  </focus-group>
</action-menu></div>
</nav>
        </div>
</header>


      <div hidden="hidden" data-view-component="true" class="js-stale-session-flash stale-session-flash flash flash-warn flash-full">
  
        <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-alert">
    <path d="M6.457 1.047c.659-1.234 2.427-1.234 3.086 0l6.082 11.378A1.75 1.75 0 0 1 14.082 15H1.918a1.75 1.75 0 0 1-1.543-2.575Zm1.763.707a.25.25 0 0 0-.44 0L1.698 13.132a.25.25 0 0 0 .22.368h12.164a.25.25 0 0 0 .22-.368Zm.53 3.996v2.5a.75.75 0 0 1-1.5 0v-2.5a.75.75 0 0 1 1.5 0ZM9 11a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
        <span class="js-stale-session-flash-signed-in" hidden>You signed in with another tab or window. <a class="Link--inTextBlock" href="">Reload</a> to refresh your session.</span>
        <span class="js-stale-session-flash-signed-out" hidden>You signed out in another tab or window. <a class="Link--inTextBlock" href="">Reload</a> to refresh your session.</span>
        <span class="js-stale-session-flash-switched" hidden>You switched accounts on another tab or window. <a class="Link--inTextBlock" href="">Reload</a> to refresh your session.</span>

    <button id="icon-button-e0c8af87-0725-484e-a33e-7b75baf9efdc" aria-labelledby="tooltip-46b6559d-b6fb-476c-b0f4-18aa3691ae5c" type="button" data-view-component="true" class="Button Button--iconOnly Button--invisible Button--medium flash-close js-flash-close">  <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x Button-visual">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg>
</button><tool-tip id="tooltip-46b6559d-b6fb-476c-b0f4-18aa3691ae5c" for="icon-button-e0c8af87-0725-484e-a33e-7b75baf9efdc" popover="manual" data-direction="s" data-type="label" data-view-component="true" class="sr-only position-absolute">Dismiss alert</tool-tip>


  
</div>
          
    </div>

  <div id="start-of-content" class="show-on-focus"></div>








    <div id="js-flash-container" class="flash-container" data-turbo-replace>




  <template class="js-flash-template">
    
<div class="flash flash-full   {{ className }}">
  <div >
    <button autofocus class="flash-close js-flash-close" type="button" aria-label="Dismiss this message">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg>
    </button>
    <div aria-atomic="true" role="alert" class="js-flash-alert">
      
      <div>{{ message }}</div>

    </div>
  </div>
</div>
  </template>
</div>


    
  <notification-shelf-watcher data-base-url="https://github.com/notifications/beta/shelf" data-channel="eyJjIjoibm90aWZpY2F0aW9uLWNoYW5nZWQ6MTA1MjI0OTIiLCJ0IjoxNzQxMjAwNjc1fQ==--0901f66d43c8d0ce4cd5ca93a0af6dabd04455e6ed4ab96f83841cc0cdfbce41" data-view-component="true" class="js-socket-channel"></notification-shelf-watcher>
  <div hidden data-initial data-target="notification-shelf-watcher.placeholder"></div>






  <div
    class="application-main "
    data-commit-hovercards-enabled
    data-discussion-hovercards-enabled
    data-issue-and-pr-hovercards-enabled
    data-project-hovercards-enabled
  >
        <div itemscope itemtype="http://schema.org/SoftwareSourceCode" class="">
    <main id="js-repo-pjax-container" >
      
      






    
  <div id="repository-container-header" data-turbo-replace hidden ></div>




<turbo-frame id="repo-content-turbo-frame" target="_top" data-turbo-action="advance" class="">
    <div id="repo-content-pjax-container" class="repository-content " >
      <a href="https://github.dev/" class="d-none js-github-dev-shortcut" data-hotkey=".,Mod+Alt+.">Open in github.dev</a>
  <a href="https://github.dev/" class="d-none js-github-dev-new-tab-shortcut" data-hotkey="Shift+.,Shift+&gt;,&gt;" target="_blank" rel="noopener noreferrer">Open in a new github.dev tab</a>
    <a class="d-none" data-hotkey=",,Mod+Alt+," target="_blank" href="/codespaces/new/Alimiji/Solr_utilisation/tree/main?resume=1">Open in codespace</a>




    
      
    








<react-app
  app-name="react-code-view"
  initial-path="/Alimiji/Solr_utilisation/blob/main/extraire_fichiers.py"
    style="display: block; min-height: calc(100vh - 64px);"
  data-attempted-ssr="true"
  data-ssr="true"
  data-lazy="false"
  data-alternate="false"
  data-data-router-enabled="false"
>
  
  <script type="application/json" data-target="react-app.embeddedData">{"payload":{"allShortcutsEnabled":true,"fileTree":{"":{"items":[{"name":"RI_PySolr (1).pdf","path":"RI_PySolr (1).pdf","contentType":"file"},{"name":"commande_curl_solr.pdf","path":"commande_curl_solr.pdf","contentType":"file"},{"name":"extraire_fichiers.py","path":"extraire_fichiers.py","contentType":"file"},{"name":"main.py","path":"main.py","contentType":"file"},{"name":"notes.txt","path":"notes.txt","contentType":"file"},{"name":"precision_recall.py","path":"precision_recall.py","contentType":"file"},{"name":"requete_resultat_solr.py","path":"requete_resultat_solr.py","contentType":"file"},{"name":"requetes.py","path":"requetes.py","contentType":"file"}],"totalCount":8}},"fileTreeProcessingTime":8.986566,"foldersToFetch":[],"repo":{"id":892964695,"defaultBranch":"main","name":"Solr_utilisation","ownerLogin":"Alimiji","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2024-11-23T01:55:01.000-05:00","ownerAvatar":"https://avatars.githubusercontent.com/u/60366981?v=4","public":true,"private":false,"isOrgOwned":false},"codeLineWrapEnabled":false,"symbolsExpanded":true,"treeExpanded":true,"refInfo":{"name":"main","listCacheKey":"v0:1732795331.0","canEdit":true,"refType":"branch","currentOid":"cc5f36a532d1b33c2bf129f168d2219e9539a097"},"path":"extraire_fichiers.py","currentUser":{"id":10522492,"login":"DominiqueLoyer","userEmail":"loyer.dominique@courrier.uqam.ca"},"blob":{"rawLines":["import json","import os","import gzip","import shutil","from requetes import extraire_requetes_longues, extraire_requetes_courtes","","import os","import gzip","import shutil","","import os","import gzip","import shutil","","# Chemins des dossiers","input_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP'","output_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP_ok'","","# Vérifie que le dossier de sortie existe, sinon le crée","os.makedirs(output_folder, exist_ok=True)","","import os","","import os","import chardet","","# Chemins des dossiers","input_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP__ok'","output_folder = '/home/alimijileking/PycharmProjects/Solr_project/AP_fixed'","","# Crée le dossier de sortie s'il n'existe pas","os.makedirs(output_folder, exist_ok=True)","","","# Fonction pour détecter l'encodage","def detect_encoding(file_path):","    with open(file_path, 'rb') as f:","        result = chardet.detect(f.read())","        return result['encoding']","","","# Fonction pour transformer un document en structure XML Solr","def transform_document(lines):","    doc_lines = []","    doc_lines.append(\"  \u003cdoc\u003e\")  # Début du document","    for line in lines:","        line = line.strip()","        if line.startswith(\"\u003cDOCNO\u003e\"):","            content = line.replace(\"\u003cDOCNO\u003e\", \"\").replace(\"\u003c/DOCNO\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield name=\\\"DOCNO\\\"\u003e{content}\u003c/field\u003e\")","        elif line.startswith(\"\u003cFILEID\u003e\"):","            content = line.replace(\"\u003cFILEID\u003e\", \"\").replace(\"\u003c/FILEID\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield name=\\\"FILEID\\\"\u003e{content}\u003c/field\u003e\")","        elif line.startswith(\"\u003cFIRST\u003e\"):","            content = line.replace(\"\u003cFIRST\u003e\", \"\").replace(\"\u003c/FIRST\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield 
name=\\\"FIRST\\\"\u003e{content}\u003c/field\u003e\")","        elif line.startswith(\"\u003cSECOND\u003e\"):","            content = line.replace(\"\u003cSECOND\u003e\", \"\").replace(\"\u003c/SECOND\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield name=\\\"SECOND\\\"\u003e{content}\u003c/field\u003e\")","        elif line.startswith(\"\u003cHEAD\u003e\"):","            content = line.replace(\"\u003cHEAD\u003e\", \"\").replace(\"\u003c/HEAD\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield name=\\\"HEAD\\\"\u003e{content}\u003c/field\u003e\")","        elif line.startswith(\"\u003cDATELINE\u003e\"):","            content = line.replace(\"\u003cDATELINE\u003e\", \"\").replace(\"\u003c/DATELINE\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield name=\\\"DATELINE\\\"\u003e{content}\u003c/field\u003e\")","        elif line.startswith(\"\u003cTEXT\u003e\"):","            content = line.replace(\"\u003cTEXT\u003e\", \"\").replace(\"\u003c/TEXT\u003e\", \"\").strip()","            doc_lines.append(f\"    \u003cfield name=\\\"TEXT\\\"\u003e{content}\u003c/field\u003e\")","    doc_lines.append(\"  \u003c/doc\u003e\")  # Fin du document","    return doc_lines","","","# Parcourt tous les fichiers XML dans le dossier d'entrée","for filename in os.listdir(input_folder):","    if filename.endswith('.xml'):","        input_path = os.path.join(input_folder, filename)","        output_path = os.path.join(output_folder, filename)","","        try:","            # Détecte l'encodage","            encoding = detect_encoding(input_path)","            print(f\"Encodage détecté pour {filename}: {encoding}\")","","            # Lit le fichier avec l'encodage détecté","            with open(input_path, 'r', encoding=encoding, errors='ignore') as file:","                lines = file.readlines()","","            # Transforme et écrit dans le fichier de sortie","            with open(output_path, 'w', encoding='utf-8') as out_file:","                out_file.write(\"\u003cadd\u003e\\n\")  # Début de la racine Solr","                current_doc = []","                in_doc = False","                for line in lines:","                    if \"\u003cDOC\u003e\" in line:  # Début d'un document","                        in_doc = True","                        current_doc = []","                    elif \"\u003c/DOC\u003e\" in line:  # Fin d'un document","                        in_doc = False","                        # Transforme le document et l'écrit dans le fichier","                        transformed_doc = transform_document(current_doc)","                        out_file.write(\"\\n\".join(transformed_doc) + \"\\n\")","                    elif in_doc:","                        current_doc.append(line)","                out_file.write(\"\u003c/add\u003e\\n\")  # Fin de la racine Solr","","            print(f\"Fichier corrigé et converti : {output_path}\")","","        except Exception as e:","            print(f\"Erreur lors du traitement de {filename}: {e}\")","","\"\"\"","# Parcourt tous les fichiers du dossier d'entrée","for filename in os.listdir(input_folder):","    if filename.endswith('.gz'):  # Vérifie si le fichier est au format .gz","        input_path = os.path.join(input_folder, filename)","        output_path = os.path.join(output_folder, filename[:-3] + '.xml')  # Supprime '.gz' et ajoute '.xml'","","        # Décompresse le fichier et l'écrit directement avec l'extension .xml","        with gzip.open(input_path, 'rb') as 
f_in:","            with open(output_path, 'wb') as f_out:","                shutil.copyfileobj(f_in, f_out)","","        print(f\"Fichier extrait et enregistré en XML : {output_path}\")\"\"\"","","\"\"\"","","","","","# Creation des requetes longues","","# Liste des fichiers à traiter","files = ['Topics-requetes/topics.1-50.txt', 'Topics-requetes/topics.51-100.txt', 'Topics-requetes/topics.101-150.txt']","","# Dictionnaire pour stocker les résultats","def lire_fichier(filepath):","    with open(filepath, 'r', encoding='utf-8') as file:","        return file.read()","req_longues_combines = {}","","req_courtes_combines = {}","","for fichier in files:","    data = lire_fichier(fichier)","    resultat = extraire_requetes_longues(data)","    req_longues_combines.update(resultat)","","for fichier in files:","    data = lire_fichier(fichier)","    resultat = extraire_requetes_courtes(data)","    req_courtes_combines.update(resultat)","\"\"\"","# Afficher les résultats combinés","#print(resultats_combines)","","# Convertion en fichier json","","# Conversion en fichier JSON","\"\"\"","with open('requetes/requetes_longues.json', 'w', encoding='utf-8') as fichier_json:","    json.dump( req_longues_combines, fichier_json, ensure_ascii=False, indent=4)","","","# Creation des requetes courtes","","with open('requetes/requetes_courtes.json', 'w', encoding='utf-8') as fichier_json:","    json.dump( req_courtes_combines, fichier_json, ensure_ascii=False, indent=4)","","\"\"\""],"stylingDirectives":[[[0,6,"pl-k"],[7,11,"pl-s1"]],[[0,6,"pl-k"],[7,9,"pl-s1"]],[[0,6,"pl-k"],[7,11,"pl-s1"]],[[0,6,"pl-k"],[7,13,"pl-s1"]],[[0,4,"pl-k"],[5,13,"pl-s1"],[14,20,"pl-k"],[21,46,"pl-s1"],[48,73,"pl-s1"]],[],[[0,6,"pl-k"],[7,9,"pl-s1"]],[[0,6,"pl-k"],[7,11,"pl-s1"]],[[0,6,"pl-k"],[7,13,"pl-s1"]],[],[[0,6,"pl-k"],[7,9,"pl-s1"]],[[0,6,"pl-k"],[7,11,"pl-s1"]],[[0,6,"pl-k"],[7,13,"pl-s1"]],[],[[0,22,"pl-c"]],[[0,12,"pl-s1"],[13,14,"pl-c1"],[15,68,"pl-s"]],[[0,13,"pl-s1"],[14,15,"pl-c1"],[16,72,"pl-s"]],[],[[0,56,"pl-c"]],[[0,2,"pl-s1"],[3,11,"pl-c1"],[12,25,"pl-s1"],[27,35,"pl-s1"],[35,36,"pl-c1"],[36,40,"pl-c1"]],[],[[0,6,"pl-k"],[7,9,"pl-s1"]],[],[[0,6,"pl-k"],[7,9,"pl-s1"]],[[0,6,"pl-k"],[7,14,"pl-s1"]],[],[[0,22,"pl-c"]],[[0,12,"pl-s1"],[13,14,"pl-c1"],[15,72,"pl-s"]],[[0,13,"pl-s1"],[14,15,"pl-c1"],[16,75,"pl-s"]],[],[[0,45,"pl-c"]],[[0,2,"pl-s1"],[3,11,"pl-c1"],[12,25,"pl-s1"],[27,35,"pl-s1"],[35,36,"pl-c1"],[36,40,"pl-c1"]],[],[],[[0,35,"pl-c"]],[[0,3,"pl-k"],[4,19,"pl-en"],[20,29,"pl-s1"]],[[4,8,"pl-k"],[9,13,"pl-en"],[14,23,"pl-s1"],[25,29,"pl-s"],[31,33,"pl-k"],[34,35,"pl-s1"]],[[8,14,"pl-s1"],[15,16,"pl-c1"],[17,24,"pl-s1"],[25,31,"pl-c1"],[32,33,"pl-s1"],[34,38,"pl-c1"]],[[8,14,"pl-k"],[15,21,"pl-s1"],[22,32,"pl-s"]],[],[],[[0,61,"pl-c"]],[[0,3,"pl-k"],[4,22,"pl-en"],[23,28,"pl-s1"]],[[4,13,"pl-s1"],[14,15,"pl-c1"]],[[4,13,"pl-s1"],[14,20,"pl-c1"],[21,30,"pl-s"],[33,52,"pl-c"]],[[4,7,"pl-k"],[8,12,"pl-s1"],[13,15,"pl-c1"],[16,21,"pl-s1"]],[[8,12,"pl-s1"],[13,14,"pl-c1"],[15,19,"pl-s1"],[20,25,"pl-c1"]],[[8,10,"pl-k"],[11,15,"pl-s1"],[16,26,"pl-c1"],[27,36,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,44,"pl-s"],[46,48,"pl-s"],[50,57,"pl-c1"],[58,68,"pl-s"],[70,72,"pl-s"],[74,79,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,75,"pl-s"],[47,49,"pl-cce"],[54,56,"pl-cce"],[57,66,"pl-s1"],[57,58,"pl-kos"],[58,65,"pl-s1"],[65,66,"pl-kos"]],[[8,12,"pl-k"],[13,17,"pl-s1"],[18,28,"pl-c1"],[29,39,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,45,"pl-s"],[47,49,
"pl-s"],[51,58,"pl-c1"],[59,70,"pl-s"],[72,74,"pl-s"],[76,81,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,76,"pl-s"],[47,49,"pl-cce"],[55,57,"pl-cce"],[58,67,"pl-s1"],[58,59,"pl-kos"],[59,66,"pl-s1"],[66,67,"pl-kos"]],[[8,12,"pl-k"],[13,17,"pl-s1"],[18,28,"pl-c1"],[29,38,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,44,"pl-s"],[46,48,"pl-s"],[50,57,"pl-c1"],[58,68,"pl-s"],[70,72,"pl-s"],[74,79,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,75,"pl-s"],[47,49,"pl-cce"],[54,56,"pl-cce"],[57,66,"pl-s1"],[57,58,"pl-kos"],[58,65,"pl-s1"],[65,66,"pl-kos"]],[[8,12,"pl-k"],[13,17,"pl-s1"],[18,28,"pl-c1"],[29,39,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,45,"pl-s"],[47,49,"pl-s"],[51,58,"pl-c1"],[59,70,"pl-s"],[72,74,"pl-s"],[76,81,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,76,"pl-s"],[47,49,"pl-cce"],[55,57,"pl-cce"],[58,67,"pl-s1"],[58,59,"pl-kos"],[59,66,"pl-s1"],[66,67,"pl-kos"]],[[8,12,"pl-k"],[13,17,"pl-s1"],[18,28,"pl-c1"],[29,37,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,43,"pl-s"],[45,47,"pl-s"],[49,56,"pl-c1"],[57,66,"pl-s"],[68,70,"pl-s"],[72,77,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,74,"pl-s"],[47,49,"pl-cce"],[53,55,"pl-cce"],[56,65,"pl-s1"],[56,57,"pl-kos"],[57,64,"pl-s1"],[64,65,"pl-kos"]],[[8,12,"pl-k"],[13,17,"pl-s1"],[18,28,"pl-c1"],[29,41,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,47,"pl-s"],[49,51,"pl-s"],[53,60,"pl-c1"],[61,74,"pl-s"],[76,78,"pl-s"],[80,85,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,78,"pl-s"],[47,49,"pl-cce"],[57,59,"pl-cce"],[60,69,"pl-s1"],[60,61,"pl-kos"],[61,68,"pl-s1"],[68,69,"pl-kos"]],[[8,12,"pl-k"],[13,17,"pl-s1"],[18,28,"pl-c1"],[29,37,"pl-s"]],[[12,19,"pl-s1"],[20,21,"pl-c1"],[22,26,"pl-s1"],[27,34,"pl-c1"],[35,43,"pl-s"],[45,47,"pl-s"],[49,56,"pl-c1"],[57,66,"pl-s"],[68,70,"pl-s"],[72,77,"pl-c1"]],[[12,21,"pl-s1"],[22,28,"pl-c1"],[29,74,"pl-s"],[47,49,"pl-cce"],[53,55,"pl-cce"],[56,65,"pl-s1"],[56,57,"pl-kos"],[57,64,"pl-s1"],[64,65,"pl-kos"]],[[4,13,"pl-s1"],[14,20,"pl-c1"],[21,31,"pl-s"],[34,51,"pl-c"]],[[4,10,"pl-k"],[11,20,"pl-s1"]],[],[],[[0,57,"pl-c"]],[[0,3,"pl-k"],[4,12,"pl-s1"],[13,15,"pl-c1"],[16,18,"pl-s1"],[19,26,"pl-c1"],[27,39,"pl-s1"]],[[4,6,"pl-k"],[7,15,"pl-s1"],[16,24,"pl-c1"],[25,31,"pl-s"]],[[8,18,"pl-s1"],[19,20,"pl-c1"],[21,23,"pl-s1"],[24,28,"pl-c1"],[29,33,"pl-c1"],[34,46,"pl-s1"],[48,56,"pl-s1"]],[[8,19,"pl-s1"],[20,21,"pl-c1"],[22,24,"pl-s1"],[25,29,"pl-c1"],[30,34,"pl-c1"],[35,48,"pl-s1"],[50,58,"pl-s1"]],[],[[8,11,"pl-k"]],[[12,32,"pl-c"]],[[12,20,"pl-s1"],[21,22,"pl-c1"],[23,38,"pl-en"],[39,49,"pl-s1"]],[[12,17,"pl-en"],[18,65,"pl-s"],[42,52,"pl-s1"],[42,43,"pl-kos"],[43,51,"pl-s1"],[51,52,"pl-kos"],[54,64,"pl-s1"],[54,55,"pl-kos"],[55,63,"pl-s1"],[63,64,"pl-kos"]],[],[[12,52,"pl-c"]],[[12,16,"pl-k"],[17,21,"pl-en"],[22,32,"pl-s1"],[34,37,"pl-s"],[39,47,"pl-s1"],[47,48,"pl-c1"],[48,56,"pl-s1"],[58,64,"pl-s1"],[64,65,"pl-c1"],[65,73,"pl-s"],[75,77,"pl-k"],[78,82,"pl-s1"]],[[16,21,"pl-s1"],[22,23,"pl-c1"],[24,28,"pl-s1"],[29,38,"pl-c1"]],[],[[12,59,"pl-c"]],[[12,16,"pl-k"],[17,21,"pl-en"],[22,33,"pl-s1"],[35,38,"pl-s"],[40,48,"pl-s1"],[48,49,"pl-c1"],[49,56,"pl-s"],[58,60,"pl-k"],[61,69,"pl-s1"]],[[16,24,"pl-s1"],[25,30,"pl-c1"],[31,40,"pl-s"],[37,39,"pl-cce"],[43,68,"pl-c"]],[[16,27,"pl-s1"],[28,29,"pl-c1"]],[[16,22,"pl-s1"],[23,24,"pl-c1"],[25,30,"pl-c1"]],[[16,19,"pl-k"],[20,24,"pl-s1"],[25,27,"pl-c1"],[28,33,"pl-s1"]],[[20,22,"pl-k"],[23,30,"pl-s"],[31,33,"pl-c1"],[34,3
8,"pl-s1"],[41,62,"pl-c"]],[[24,30,"pl-s1"],[31,32,"pl-c1"],[33,37,"pl-c1"]],[[24,35,"pl-s1"],[36,37,"pl-c1"]],[[20,24,"pl-k"],[25,33,"pl-s"],[34,36,"pl-c1"],[37,41,"pl-s1"],[44,63,"pl-c"]],[[24,30,"pl-s1"],[31,32,"pl-c1"],[33,38,"pl-c1"]],[[24,75,"pl-c"]],[[24,39,"pl-s1"],[40,41,"pl-c1"],[42,60,"pl-en"],[61,72,"pl-s1"]],[[24,32,"pl-s1"],[33,38,"pl-c1"],[39,43,"pl-s"],[40,42,"pl-cce"],[44,48,"pl-c1"],[49,64,"pl-s1"],[66,67,"pl-c1"],[68,72,"pl-s"],[69,71,"pl-cce"]],[[20,24,"pl-k"],[25,31,"pl-s1"]],[[24,35,"pl-s1"],[36,42,"pl-c1"],[43,47,"pl-s1"]],[[16,24,"pl-s1"],[25,30,"pl-c1"],[31,41,"pl-s"],[38,40,"pl-cce"],[44,67,"pl-c"]],[],[[12,17,"pl-en"],[18,64,"pl-s"],[50,63,"pl-s1"],[50,51,"pl-kos"],[51,62,"pl-s1"],[62,63,"pl-kos"]],[],[[8,14,"pl-k"],[15,24,"pl-v"],[25,27,"pl-k"],[28,29,"pl-s1"]],[[12,17,"pl-en"],[18,65,"pl-s"],[49,59,"pl-s1"],[49,50,"pl-kos"],[50,58,"pl-s1"],[58,59,"pl-kos"],[61,64,"pl-s1"],[61,62,"pl-kos"],[62,63,"pl-s1"],[63,64,"pl-kos"]],[],[[0,3,"pl-s"]],[[0,48,"pl-s"]],[[0,41,"pl-s"]],[[0,75,"pl-s"]],[[0,57,"pl-s"]],[[0,108,"pl-s"]],[[0,0,"pl-s"]],[[0,77,"pl-s"]],[[0,49,"pl-s"]],[[0,50,"pl-s"]],[[0,47,"pl-s"]],[[0,0,"pl-s"]],[[0,73,"pl-s"]],[],[[0,3,"pl-s"]],[[0,0,"pl-s"]],[[0,0,"pl-s"]],[[0,0,"pl-s"]],[[0,0,"pl-s"]],[[0,31,"pl-s"]],[[0,0,"pl-s"]],[[0,30,"pl-s"]],[[0,118,"pl-s"]],[[0,0,"pl-s"]],[[0,41,"pl-s"]],[[0,27,"pl-s"]],[[0,55,"pl-s"]],[[0,26,"pl-s"]],[[0,25,"pl-s"]],[[0,0,"pl-s"]],[[0,25,"pl-s"]],[[0,0,"pl-s"]],[[0,21,"pl-s"]],[[0,32,"pl-s"]],[[0,46,"pl-s"]],[[0,41,"pl-s"]],[[0,0,"pl-s"]],[[0,21,"pl-s"]],[[0,32,"pl-s"]],[[0,46,"pl-s"]],[[0,41,"pl-s"]],[[0,3,"pl-s"]],[[0,33,"pl-c"]],[[0,26,"pl-c"]],[],[[0,28,"pl-c"]],[],[[0,28,"pl-c"]],[[0,3,"pl-s"]],[[0,83,"pl-s"]],[[0,80,"pl-s"]],[[0,0,"pl-s"]],[[0,0,"pl-s"]],[[0,31,"pl-s"]],[[0,0,"pl-s"]],[[0,83,"pl-s"]],[[0,80,"pl-s"]],[[0,0,"pl-s"]],[[0,3,"pl-s"]],[],[]],"colorizedLines":null,"csv":null,"csvError":null,"dependabotInfo":{"showConfigurationBanner":false,"configFilePath":null,"networkDependabotPath":"/Alimiji/Solr_utilisation/network/updates","dismissConfigurationNoticePath":"/settings/dismiss-notice/dependabot_configuration_notice","configurationNoticeDismissed":false},"displayName":"extraire_fichiers.py","displayUrl":"https://github.com/Alimiji/Solr_utilisation/blob/main/extraire_fichiers.py?raw=true","headerInfo":{"blobSize":"6.04 KB","deleteTooltip":"Delete the file in your fork of this project","editTooltip":"Edit the file in your fork of this 
project","ghDesktopPath":"x-github-client://openRepo/https://github.com/Alimiji/Solr_utilisation?branch=main\u0026filepath=extraire_fichiers.py","isGitLfs":false,"onBranch":true,"shortPath":"b01fd91","siteNavLoginPath":"/login?return_to=https%3A%2F%2Fgithub.com%2FAlimiji%2FSolr_utilisation%2Fblob%2Fmain%2Fextraire_fichiers.py","isCSV":false,"isRichtext":false,"toc":null,"lineInfo":{"truncatedLoc":"171","truncatedSloc":"131"},"mode":"file"},"image":false,"isCodeownersFile":null,"isPlain":false,"isValidLegacyIssueTemplate":false,"issueTemplate":null,"discussionTemplate":null,"language":"Python","languageID":303,"large":false,"planSupportInfo":{"repoIsFork":null,"repoOwnedByCurrentUser":null,"requestFullPath":"/Alimiji/Solr_utilisation/blob/main/extraire_fichiers.py","showFreeOrgGatedFeatureMessage":null,"showPlanSupportBanner":null,"upgradeDataAttributes":null,"upgradePath":null},"publishBannersInfo":{"dismissActionNoticePath":"/settings/dismiss-notice/publish_action_from_dockerfile","releasePath":"/Alimiji/Solr_utilisation/releases/new?marketplace=true","showPublishActionBanner":false},"rawBlobUrl":"https://github.com/Alimiji/Solr_utilisation/raw/refs/heads/main/extraire_fichiers.py","renderImageOrRaw":false,"richText":null,"renderedFileInfo":null,"shortPath":null,"symbolsEnabled":true,"tabSize":8,"topBannersInfo":{"overridingGlobalFundingFile":false,"globalPreferredFundingPath":null,"showInvalidCitationWarning":false,"citationHelpUrl":"https://docs.github.com/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files","actionsOnboardingTip":null},"truncated":false,"viewable":true,"workflowRedirectUrl":null,"symbols":{"timed_out":false,"not_analyzed":false,"symbols":[{"name":"input_folder","kind":"constant","ident_start":220,"ident_end":232,"extent_start":220,"extent_end":288,"fully_qualified_name":"input_folder","ident_utf16":{"start":{"line_number":15,"utf16_col":0},"end":{"line_number":15,"utf16_col":12}},"extent_utf16":{"start":{"line_number":15,"utf16_col":0},"end":{"line_number":15,"utf16_col":68}}},{"name":"output_folder","kind":"constant","ident_start":289,"ident_end":302,"extent_start":289,"extent_end":361,"fully_qualified_name":"output_folder","ident_utf16":{"start":{"line_number":16,"utf16_col":0},"end":{"line_number":16,"utf16_col":13}},"extent_utf16":{"start":{"line_number":16,"utf16_col":0},"end":{"line_number":16,"utf16_col":72}}},{"name":"input_folder","kind":"constant","ident_start":525,"ident_end":537,"extent_start":525,"extent_end":597,"fully_qualified_name":"input_folder","ident_utf16":{"start":{"line_number":27,"utf16_col":0},"end":{"line_number":27,"utf16_col":12}},"extent_utf16":{"start":{"line_number":27,"utf16_col":0},"end":{"line_number":27,"utf16_col":72}}},{"name":"output_folder","kind":"constant","ident_start":598,"ident_end":611,"extent_start":598,"extent_end":673,"fully_qualified_name":"output_folder","ident_utf16":{"start":{"line_number":28,"utf16_col":0},"end":{"line_number":28,"utf16_col":13}},"extent_utf16":{"start":{"line_number":28,"utf16_col":0},"end":{"line_number":28,"utf16_col":75}}},{"name":"detect_encoding","kind":"function","ident_start":807,"ident_end":822,"extent_start":803,"extent_end":947,"fully_qualified_name":"detect_encoding","ident_utf16":{"start":{"line_number":35,"utf16_col":4},"end":{"line_number":35,"utf16_col":19}},"extent_utf16":{"start":{"line_number":35,"utf16_col":0},"end":{"line_number":38,"utf16_col":33}}},{"name":"transform_document","kind":"function","ident_start":1016,"ident_end":10
34,"extent_start":1012,"extent_end":2649,"fully_qualified_name":"transform_document","ident_utf16":{"start":{"line_number":42,"utf16_col":4},"end":{"line_number":42,"utf16_col":22}},"extent_utf16":{"start":{"line_number":42,"utf16_col":0},"end":{"line_number":69,"utf16_col":20}}}]}},"copilotInfo":null,"copilotAccessAllowed":true,"modelsAccessAllowed":false,"csrf_tokens":{"/Alimiji/Solr_utilisation/branches":{"post":"u-AIegMwVHgrZih-2iQMaFPTafhn0fo66DlWp0BaSXgG20lvy-QDH-psuH269hin7cACmCFXZycmUhZp0Jyz9A"},"/repos/preferences":{"post":"qbpPXnF_H2OtYY1ieaCfX3EOKuhFgX6re0kxJYUXy1Z67sVArhAiFcU2p8r_MZH-sNHzpRdoF3XP547gy6wrgg"}}},"title":"Solr_utilisation/extraire_fichiers.py at main · Alimiji/Solr_utilisation","appPayload":{"helpUrl":"https://docs.github.com","findFileWorkerPath":"/assets-cdn/worker/find-file-worker-7d7eb7c71814.js","findInFileWorkerPath":"/assets-cdn/worker/find-in-file-worker-96e76d5fdb2c.js","githubDevUrl":"https://github.dev/","enabled_features":{"code_nav_ui_events":false,"overview_shared_code_dropdown_button":false,"react_blob_overlay":true,"copilot_conversational_ux_embedding_update":false,"copilot_smell_icebreaker_ux":true,"accessible_code_button":true}}}</script>
  <div data-target="react-app.reactRoot"><style data-styled="true" data-styled-version="5.3.11">.hOfjFo{padding:0;}/*!sc*/
.oDGAe{max-width:100%;margin-left:auto;margin-right:auto;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;}/*!sc*/
.kowOcT{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex:1 1 100%;-ms-flex:1 1 100%;flex:1 1 100%;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;max-width:100%;}/*!sc*/
.gISSDQ{width:100%;}/*!sc*/
@media screen and (min-width:544px){.gISSDQ{width:100%;}}/*!sc*/
@media screen and (min-width:768px){.gISSDQ{width:auto;}}/*!sc*/
.fHCyST{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-order:1;-ms-flex-order:1;order:1;width:100%;margin-left:0;margin-right:0;-webkit-flex-direction:column-reverse;-ms-flex-direction:column-reverse;flex-direction:column-reverse;margin-bottom:0;min-width:0;}/*!sc*/
@media screen and (min-width:768px){.fHCyST{width:auto;margin-top:0 !important;margin-bottom:0 !important;position:-webkit-sticky;position:sticky;top:0px;max-height:100vh !important;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;margin-right:0;height:100vh;}}/*!sc*/
@media print,screen and (max-width:1349px) and (min-width:768px){.fHCyST{display:none;}}/*!sc*/
.hPvFuC{margin-left:0;margin-right:0;display:none;margin-top:0;}/*!sc*/
@media screen and (min-width:768px){.hPvFuC{margin-left:0 !important;margin-right:0 !important;}}/*!sc*/
.fFSoPl{--pane-min-width:256px;--pane-max-width-diff:511px;--pane-max-width:calc(100vw - var(--pane-max-width-diff));width:100%;padding:0;}/*!sc*/
@media screen and (min-width:544px){}/*!sc*/
@media screen and (min-width:768px){.fFSoPl{width:clamp(var(--pane-min-width),var(--pane-width),var(--pane-max-width));overflow:auto;}}/*!sc*/
@media screen and (min-width:1280px){.fFSoPl{--pane-max-width-diff:959px;}}/*!sc*/
.birIjn{max-height:100%;height:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;}/*!sc*/
@media screen and (max-width:768px){.birIjn{display:none;}}/*!sc*/
@media screen and (min-width:768px){.birIjn{max-height:100vh;height:100vh;}}/*!sc*/
.hNNCwk{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;padding-left:16px;padding-right:16px;padding-bottom:8px;padding-top:16px;}/*!sc*/
.jfIeyl{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;margin-bottom:16px;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;}/*!sc*/
.XosP{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;font-size:14px;}/*!sc*/
.hMLRgO[data-size="medium"]{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));padding-left:8px;padding-right:8px;display:none;}/*!sc*/
@media screen and (max-width:768px){.hMLRgO[data-size="medium"]{display:block;}}/*!sc*/
.gUkoLg{-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;}/*!sc*/
.kOkWgo{font-size:16px;margin-left:8px;}/*!sc*/
.lhbroM{margin-left:24px;margin-right:24px;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;}/*!sc*/
.khzwtX{-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;}/*!sc*/
.JMXqM[data-size="medium"]{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;min-width:0;}/*!sc*/
.JMXqM[data-size="medium"] svg{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.JMXqM[data-size="medium"] > span{width:inherit;}/*!sc*/
.bZBlpz{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;}/*!sc*/
.bJjzmO{margin-right:4px;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.ffLUq{font-size:14px;min-width:0;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;}/*!sc*/
.eTeVqd{margin-left:8px;white-space:nowrap;}/*!sc*/
.eTeVqd:hover button:not(:hover){border-left-color:var(--button-default-borderColor-hover,var(--color-btn-hover-border));}/*!sc*/
.jNHrPP[data-size="medium"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-subtle,#6e7681));border-top-right-radius:0;border-bottom-right-radius:0;border-right:0;}/*!sc*/
.ijefGF[data-size="medium"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-subtle,#6e7681));font-size:14px;font-weight:400;-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;border-top-left-radius:0;border-bottom-left-radius:0;}/*!sc*/
.ftzGWg{margin-left:16px;margin-right:16px;margin-bottom:12px;}/*!sc*/
@media screen and (max-width:768px){.ftzGWg{display:none;}}/*!sc*/
.dItACB{margin-right:-6px;}/*!sc*/
.gjtfVk{-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;max-height:100% !important;overflow-y:auto;-webkit-scrollbar-gutter:stable;-moz-scrollbar-gutter:stable;-ms-scrollbar-gutter:stable;scrollbar-gutter:stable;}/*!sc*/
@media screen and (max-width:768px){.gjtfVk{display:none;}}/*!sc*/
.cOxzdh{padding-left:16px;padding-right:16px;padding-bottom:8px;}/*!sc*/
.bTBnTW{height:100%;position:relative;display:none;margin-left:0;}/*!sc*/
.fFMzrG{position:absolute;inset:0 -2px;cursor:col-resize;background-color:transparent;-webkit-transition-delay:0.1s;transition-delay:0.1s;}/*!sc*/
.fFMzrG:hover{background-color:var(--bgColor-neutral-muted,var(--color-neutral-muted,rgba(110,118,129,0.4)));}/*!sc*/
.iKqMNA{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;-webkit-order:2;-ms-flex-order:2;order:2;-webkit-flex-basis:0;-ms-flex-preferred-size:0;flex-basis:0;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;-webkit-flex-shrink:1;-ms-flex-negative:1;flex-shrink:1;min-width:1px;margin-right:auto;}/*!sc*/
@media print{.iKqMNA{display:-webkit-box !important;display:-webkit-flex !important;display:-ms-flexbox !important;display:flex !important;}}/*!sc*/
.FxAyp{width:100%;max-width:100%;margin-left:auto;margin-right:auto;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;padding:0;}/*!sc*/
.leYMvG{margin-left:auto;margin-right:auto;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;padding-bottom:40px;max-width:100%;margin-top:0;}/*!sc*/
.KMPzq{display:inherit;}/*!sc*/
.hfKjHv{width:100%;}/*!sc*/
.gZWyZE{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;gap:8px;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;width:100%;}/*!sc*/
.dwYKDk{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:start;-webkit-box-align:start;-ms-flex-align:start;align-items:start;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;gap:8px;}/*!sc*/
.iDtIiT{-webkit-align-self:center;-ms-flex-item-align:center;align-self:center;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;padding-right:8px;min-width:0;}/*!sc*/
.cEytCf{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;font-size:16px;min-width:0;-webkit-flex-shrink:1;-ms-flex-negative:1;flex-shrink:1;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;max-width:100%;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;}/*!sc*/
.fzFXnm{max-width:100%;}/*!sc*/
.iMnkmv{max-width:100%;list-style:none;display:inline-block;}/*!sc*/
.ghzDag{display:inline-block;max-width:100%;}/*!sc*/
.kHuKdh{font-weight:600;}/*!sc*/
.jGhzSQ{font-weight:600;display:inline-block;max-width:100%;font-size:16px;}/*!sc*/
.faNtbn{min-height:32px;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:start;-webkit-box-align:start;-ms-flex-align:start;align-items:start;}/*!sc*/
.dwNhzn[data-size="medium"][data-no-visuals]{border-top-left-radius:0;border-bottom-left-radius:0;display:none;}/*!sc*/
.kVRliy[data-size="medium"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.dJxjrT{margin-left:16px;margin-right:16px;}/*!sc*/
.eFxKDQ{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;}/*!sc*/
.dzCJzi{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;gap:8px;min-width:273px;padding:8px;}/*!sc*/
@media screen and (min-width:544px){.dzCJzi{-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;}}/*!sc*/
.ldRxiI{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;}/*!sc*/
.efRoCL{width:100%;height:-webkit-fit-content;height:-moz-fit-content;height:fit-content;min-width:0;margin-right:16px;}/*!sc*/
.gNAmSV{height:40px;padding-left:4px;padding-bottom:16px;}/*!sc*/
.jNEwzY{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;}/*!sc*/
.ifyOQK{font-size:12px;-webkit-flex:auto;-ms-flex:auto;flex:auto;padding-right:16px;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));min-width:0;}/*!sc*/
.jdLMhu{top:0px;z-index:4;background:var(--bgColor-default,var(--color-canvas-default));position:-webkit-sticky;position:sticky;}/*!sc*/
.tOISc{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;width:100%;position:absolute;}/*!sc*/
.hqwSEx{display:none;min-width:0;padding-top:8px;padding-bottom:8px;}/*!sc*/
.lzKZY{margin-right:8px;margin-left:16px;text-overflow:ellipsis;overflow:hidden;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;width:100%;}/*!sc*/
.fHind{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;font-size:14px;min-width:0;-webkit-flex-shrink:1;-ms-flex-negative:1;flex-shrink:1;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;max-width:100%;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;}/*!sc*/
.dnZoUW{font-weight:600;display:inline-block;max-width:100%;font-size:14px;}/*!sc*/
.dpNnZU[data-size="small"]{color:var(--fgColor-default,var(--color-fg-default,#e6edf3));margin-left:8px;}/*!sc*/
.gpHFJV{padding-left:8px;padding-top:8px;padding-bottom:8px;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex:1;-ms-flex:1;flex:1;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;background-color:var(--bgColor-muted,var(--color-canvas-subtle,#161b22));border:1px solid var(--borderColor-default,var(--color-border-default));border-radius:6px 6px 0px 0px;}/*!sc*/
.iNMjfP{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;gap:8px;min-width:0;}/*!sc*/
.fefCSX{display:block;position:relative;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;margin-top:-1px;margin-bottom:-1px;--separator-color:transparent;}/*!sc*/
.fefCSX:not(:last-child){margin-right:1px;}/*!sc*/
.fefCSX:not(:last-child):after{background-color:var(--separator-color);content:"";position:absolute;right:-2px;top:8px;bottom:8px;width:1px;}/*!sc*/
.fefCSX:focus-within:has(:focus-visible){--separator-color:transparent;}/*!sc*/
.fefCSX:first-child{margin-left:-1px;}/*!sc*/
.fefCSX:last-child{margin-right:-1px;}/*!sc*/
.sulSy{display:block;position:relative;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;margin-top:-1px;margin-bottom:-1px;--separator-color:var(--borderColor-default,var(--color-border-default,#30363d));}/*!sc*/
.sulSy:not(:last-child){margin-right:1px;}/*!sc*/
.sulSy:not(:last-child):after{background-color:var(--separator-color);content:"";position:absolute;right:-2px;top:8px;bottom:8px;width:1px;}/*!sc*/
.sulSy:focus-within:has(:focus-visible){--separator-color:transparent;}/*!sc*/
.sulSy:first-child{margin-left:-1px;}/*!sc*/
.sulSy:last-child{margin-right:-1px;}/*!sc*/
.kcLCKF{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;gap:8px;margin-right:8px;}/*!sc*/
.kVWtTz{gap:8px;}/*!sc*/
.gWqxTd{padding-left:8px;padding-right:8px;}/*!sc*/
.gWqxTd linkButtonSx:hover:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.gWqxTd linkButtonSx:focus:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.gWqxTd linkButtonSx:active:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.ivobqY[data-size="small"][data-no-visuals]{border-top-left-radius:0;border-bottom-left-radius:0;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]{border-top-right-radius:0;border-bottom-right-radius:0;border-right-width:0;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]:hover:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]:focus:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.kilKoS[data-size="small"][data-no-visuals]:active:not([disabled]){-webkit-text-decoration:none;text-decoration:none;}/*!sc*/
.hySUEo[data-size="small"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));position:relative;}/*!sc*/
.itGLhU[data-size="small"][data-no-visuals]{color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.hycJXc{border:1px solid;border-top:none;border-color:var(--borderColor-default,var(--color-border-default,#30363d));border-radius:0px 0px 6px 6px;min-width:273px;}/*!sc*/
.dceWRL{background-color:var(--bgColor-default,var(--color-canvas-default));border:0px;border-width:0;border-radius:0px 0px 6px 6px;padding:0;min-width:0;margin-top:46px;}/*!sc*/
.dGXHv{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex:1;-ms-flex:1;flex:1;padding-top:8px;padding-bottom:8px;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;min-width:0;position:relative;}/*!sc*/
.bpDFns{position:relative;}/*!sc*/
.iJOeCH{-webkit-flex:1;-ms-flex:1;flex:1;position:relative;min-width:0;}/*!sc*/
.jewUnv{tab-size:8;isolation:isolate;position:relative;overflow:auto;max-width:unset;}/*!sc*/
.cJGaMs{margin:1px 8px;position:absolute;z-index:1;}/*!sc*/
.iGLarr{position:absolute;}/*!sc*/
.mgQhK{padding-bottom:33px;}/*!sc*/
.ipeRWy{background-color:var(--bgColor-default,var(--color-canvas-default,#0d1117));border:1px solid;border-color:var(--borderColor-default,var(--color-border-default,#30363d));border-radius:6px;contain:paint;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;height:100%;min-height:0;max-height:100vh;overflow-y:auto;right:0;top:0px;z-index:4;background:var(--bgColor-default,var(--color-canvas-default));position:-webkit-sticky;position:sticky;}/*!sc*/
.cxUsTr{padding-top:8px;padding-bottom:8px;padding-left:16px;padding-right:16px;}/*!sc*/
.jXkPPw{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;}/*!sc*/
.hECgeo{font-size:14px;-webkit-order:1;-ms-flex-order:1;order:1;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;font-weight:600;}/*!sc*/
.fotqAA[data-size="medium"][data-no-visuals]{-webkit-order:3;-ms-flex-order:3;order:3;color:var(--fgColor-default,var(--color-fg-default,#e6edf3));margin-right:-8px;}/*!sc*/
.hoyhab{font-size:12px;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));padding-top:8px;}/*!sc*/
.gqhZpQ{margin-right:6px;}/*!sc*/
.ccgkJf{margin-left:-16px;margin-bottom:-8px;}/*!sc*/
.kACRto{margin-bottom:-8px;overflow-y:auto;max-height:calc(100vh - 237px);padding-left:16px;padding-bottom:8px;padding-top:4px;}/*!sc*/
.cSURfY{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;}/*!sc*/
.bTXewe{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;position:relative;margin-right:8px;}/*!sc*/
.dotKsF{background-color:var(--color-prettylights-syntax-variable,#ffa657);opacity:0.1;position:absolute;border-radius:5px;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;height:100%;}/*!sc*/
.iGIwaf{color:var(--color-prettylights-syntax-variable,#ffa657);border-radius:5px;font-weight:600;font-size:smaller;padding-left:4px;padding-right:4px;padding-top:1px;padding-bottom:1px;}/*!sc*/
.gxAxAi{background-color:var(--color-prettylights-syntax-entity,#d2a8ff);opacity:0.1;position:absolute;border-radius:5px;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%;height:100%;}/*!sc*/
.gWkFIQ{color:var(--color-prettylights-syntax-entity,#d2a8ff);border-radius:5px;font-weight:600;font-size:smaller;padding-left:4px;padding-right:4px;padding-top:1px;padding-bottom:1px;}/*!sc*/
.cCoXib{position:fixed;top:0;right:0;height:100%;width:15px;-webkit-transition:-webkit-transform 0.3s;-webkit-transition:transform 0.3s;transition:transform 0.3s;z-index:1;}/*!sc*/
.cCoXib:hover{-webkit-transform:scaleX(1.5);-ms-transform:scaleX(1.5);transform:scaleX(1.5);}/*!sc*/
data-styled.g1[id="Box-sc-g0xbh4-0"]{content:"hOfjFo,oDGAe,kowOcT,gISSDQ,fHCyST,hPvFuC,fFSoPl,birIjn,hNNCwk,jfIeyl,XosP,hMLRgO,gUkoLg,kOkWgo,lhbroM,khzwtX,JMXqM,bZBlpz,bJjzmO,ffLUq,eTeVqd,jNHrPP,ijefGF,ftzGWg,dItACB,gjtfVk,cOxzdh,bTBnTW,fFMzrG,iKqMNA,FxAyp,leYMvG,KMPzq,hfKjHv,gZWyZE,dwYKDk,iDtIiT,cEytCf,fzFXnm,iMnkmv,ghzDag,kHuKdh,jGhzSQ,faNtbn,dwNhzn,kVRliy,dJxjrT,eFxKDQ,dzCJzi,ldRxiI,efRoCL,gNAmSV,jNEwzY,ifyOQK,jdLMhu,tOISc,hqwSEx,lzKZY,fHind,dnZoUW,dpNnZU,gpHFJV,iNMjfP,fefCSX,sulSy,kcLCKF,kVWtTz,gWqxTd,ivobqY,kilKoS,hySUEo,itGLhU,hycJXc,dceWRL,dGXHv,bpDFns,iJOeCH,jewUnv,cJGaMs,iGLarr,mgQhK,ipeRWy,cxUsTr,jXkPPw,hECgeo,fotqAA,hoyhab,gqhZpQ,ccgkJf,kACRto,cSURfY,bTXewe,dotKsF,iGIwaf,gxAxAi,gWkFIQ,cCoXib,"}/*!sc*/
.eMMFM{min-width:0;}/*!sc*/
.eMMFM:where([data-size='small']){font-size:var(--text-body-size-small,0.75rem);line-height:var(--text-body-lineHeight-small,1.6666);}/*!sc*/
.eMMFM:where([data-size='medium']){font-size:var(--text-body-size-medium,0.875rem);line-height:var(--text-body-lineHeight-medium,1.4285);}/*!sc*/
.eMMFM:where([data-size='large']){font-size:var(--text-body-size-large,1rem);line-height:var(--text-body-lineHeight-large,1.5);}/*!sc*/
.eMMFM:where([data-weight='light']){font-weight:var(--base-text-weight-light,300);}/*!sc*/
.eMMFM:where([data-weight='normal']){font-weight:var(--base-text-weight-normal,400);}/*!sc*/
.eMMFM:where([data-weight='medium']){font-weight:var(--base-text-weight-medium,500);}/*!sc*/
.eMMFM:where([data-weight='semibold']){font-weight:var(--base-text-weight-semibold,600);}/*!sc*/
.iHQnrN{padding-left:4px;padding-right:4px;font-weight:400;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));font-size:16px;}/*!sc*/
.iHQnrN:where([data-size='small']){font-size:var(--text-body-size-small,0.75rem);line-height:var(--text-body-lineHeight-small,1.6666);}/*!sc*/
.iHQnrN:where([data-size='medium']){font-size:var(--text-body-size-medium,0.875rem);line-height:var(--text-body-lineHeight-medium,1.4285);}/*!sc*/
.iHQnrN:where([data-size='large']){font-size:var(--text-body-size-large,1rem);line-height:var(--text-body-lineHeight-large,1.5);}/*!sc*/
.iHQnrN:where([data-weight='light']){font-weight:var(--base-text-weight-light,300);}/*!sc*/
.iHQnrN:where([data-weight='normal']){font-weight:var(--base-text-weight-normal,400);}/*!sc*/
.iHQnrN:where([data-weight='medium']){font-weight:var(--base-text-weight-medium,500);}/*!sc*/
.iHQnrN:where([data-weight='semibold']){font-weight:var(--base-text-weight-semibold,600);}/*!sc*/
.wcuBT{padding-left:4px;padding-right:4px;font-weight:400;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));font-size:14px;}/*!sc*/
.wcuBT:where([data-size='small']){font-size:var(--text-body-size-small,0.75rem);line-height:var(--text-body-lineHeight-small,1.6666);}/*!sc*/
.wcuBT:where([data-size='medium']){font-size:var(--text-body-size-medium,0.875rem);line-height:var(--text-body-lineHeight-medium,1.4285);}/*!sc*/
.wcuBT:where([data-size='large']){font-size:var(--text-body-size-large,1rem);line-height:var(--text-body-lineHeight-large,1.5);}/*!sc*/
.wcuBT:where([data-weight='light']){font-weight:var(--base-text-weight-light,300);}/*!sc*/
.wcuBT:where([data-weight='normal']){font-weight:var(--base-text-weight-normal,400);}/*!sc*/
.wcuBT:where([data-weight='medium']){font-weight:var(--base-text-weight-medium,500);}/*!sc*/
.wcuBT:where([data-weight='semibold']){font-weight:var(--base-text-weight-semibold,600);}/*!sc*/
data-styled.g5[id="Text__StyledText-sc-17v1xeu-0"]{content:"eMMFM,iHQnrN,wcuBT,"}/*!sc*/
.brGdpi{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;-webkit-clip:rect(0,0,0,0);clip:rect(0,0,0,0);white-space:nowrap;border-width:0;}/*!sc*/
data-styled.g6[id="_VisuallyHidden__VisuallyHidden-sc-11jhm7a-0"]{content:"brGdpi,"}/*!sc*/
.jkNcAv{border:0;font-size:inherit;font-family:inherit;background-color:transparent;-webkit-appearance:none;color:inherit;width:100%;}/*!sc*/
.jkNcAv:focus{outline:0;}/*!sc*/
data-styled.g13[id="UnstyledTextInput__ToggledUnstyledTextInput-sc-14ypya-0"]{content:"jkNcAv,"}/*!sc*/
.bclhiL{font-size:14px;line-height:var(--base-size-20);color:var(--fgColor-default,var(--color-fg-default,#e6edf3));vertical-align:middle;background-color:var(--bgColor-default,var(--color-canvas-default,#0d1117));border:1px solid var(--control-borderColor-rest,var(--borderColor-default,var(--color-border-default,#30363d)));border-radius:6px;outline:none;box-shadow:var(--shadow-inset,var(--color-primer-shadow-inset,0 0 transparent));display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;min-height:var(--base-size-32);overflow:hidden;--inner-action-size:var(--base-size-24);}/*!sc*/
.bclhiL input,.bclhiL textarea{cursor:text;}/*!sc*/
.bclhiL select{cursor:pointer;}/*!sc*/
.bclhiL input::-webkit-input-placeholder,.bclhiL textarea::-webkit-input-placeholder,.bclhiL select::-webkit-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL input::-moz-placeholder,.bclhiL textarea::-moz-placeholder,.bclhiL select::-moz-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL input:-ms-input-placeholder,.bclhiL textarea:-ms-input-placeholder,.bclhiL select:-ms-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL input::placeholder,.bclhiL textarea::placeholder,.bclhiL select::placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.bclhiL:where([data-trailing-action][data-focused]),.bclhiL:where(:not([data-trailing-action]):focus-within){border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.bclhiL > textarea{padding:var(--base-size-12);}/*!sc*/
.bclhiL:where([data-contrast]){background-color:var(--bgColor-inset,var(--color-canvas-inset,#010409));}/*!sc*/
.bclhiL:where([data-disabled]){color:var(--fgColor-disabled,var(--color-primer-fg-disabled,#484f58));background-color:var(--control-bgColor-disabled,var(--color-input-disabled-bg,rgba(110,118,129,0)));box-shadow:none;border-color:var(--control-borderColor-disabled,var(--borderColor-default,var(--color-border-default,#30363d)));}/*!sc*/
.bclhiL:where([data-disabled]) input,.bclhiL:where([data-disabled]) textarea,.bclhiL:where([data-disabled]) select{cursor:not-allowed;}/*!sc*/
.bclhiL:where([data-monospace]){font-family:var(--fontStack-monospace,SFMono-Regular,Consolas,"Liberation Mono",Menlo,Courier,monospace);}/*!sc*/
.bclhiL:where([data-validation='error']){border-color:var(--borderColor-danger-emphasis,var(--color-danger-emphasis,#da3633));}/*!sc*/
.bclhiL:where([data-validation='error']):where([data-trailing-action][data-focused]),.bclhiL:where([data-validation='error']):where(:not([data-trailing-action])):focus-within{border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.bclhiL:where([data-validation='success']){border-color:var(--bgColor-success-emphasis,var(--color-success-emphasis,#238636));}/*!sc*/
.bclhiL:where([data-block]){width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-self:stretch;-ms-flex-item-align:stretch;align-self:stretch;}/*!sc*/
@media (min-width:768px){.bclhiL{font-size:var(--text-body-size-medium);}}/*!sc*/
.bclhiL:where([data-size='small']){--inner-action-size:var(--base-size-20);min-height:var(--base-size-28);padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:var(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.bclhiL:where([data-size='large']){--inner-action-size:var(--base-size-28);height:var(--base-size-40);padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);}/*!sc*/
.bclhiL:where([data-variant='small']){min-height:28px;padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.bclhiL:where([data-variant='large']){padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);font-size:var(--text-title-size-medium);}/*!sc*/
.bclhiL{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;min-width:160px;}/*!sc*/
.zEBjf{font-size:14px;line-height:var(--base-size-20);color:var(--fgColor-default,var(--color-fg-default,#e6edf3));vertical-align:middle;background-color:var(--bgColor-default,var(--color-canvas-default,#0d1117));border:1px solid var(--control-borderColor-rest,var(--borderColor-default,var(--color-border-default,#30363d)));border-radius:6px;outline:none;box-shadow:var(--shadow-inset,var(--color-primer-shadow-inset,0 0 transparent));display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;-webkit-align-items:stretch;-webkit-box-align:stretch;-ms-flex-align:stretch;align-items:stretch;min-height:var(--base-size-32);overflow:hidden;--inner-action-size:var(--base-size-24);}/*!sc*/
.zEBjf input,.zEBjf textarea{cursor:text;}/*!sc*/
.zEBjf select{cursor:pointer;}/*!sc*/
.zEBjf input::-webkit-input-placeholder,.zEBjf textarea::-webkit-input-placeholder,.zEBjf select::-webkit-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf input::-moz-placeholder,.zEBjf textarea::-moz-placeholder,.zEBjf select::-moz-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf input:-ms-input-placeholder,.zEBjf textarea:-ms-input-placeholder,.zEBjf select:-ms-input-placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf input::placeholder,.zEBjf textarea::placeholder,.zEBjf select::placeholder{color:var(---control-fgColor-placeholder,var(--fgColor-muted,var(--color-fg-muted,#848d97)));}/*!sc*/
.zEBjf:where([data-trailing-action][data-focused]),.zEBjf:where(:not([data-trailing-action]):focus-within){border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.zEBjf > textarea{padding:var(--base-size-12);}/*!sc*/
.zEBjf:where([data-contrast]){background-color:var(--bgColor-inset,var(--color-canvas-inset,#010409));}/*!sc*/
.zEBjf:where([data-disabled]){color:var(--fgColor-disabled,var(--color-primer-fg-disabled,#484f58));background-color:var(--control-bgColor-disabled,var(--color-input-disabled-bg,rgba(110,118,129,0)));box-shadow:none;border-color:var(--control-borderColor-disabled,var(--borderColor-default,var(--color-border-default,#30363d)));}/*!sc*/
.zEBjf:where([data-disabled]) input,.zEBjf:where([data-disabled]) textarea,.zEBjf:where([data-disabled]) select{cursor:not-allowed;}/*!sc*/
.zEBjf:where([data-monospace]){font-family:var(--fontStack-monospace,SFMono-Regular,Consolas,"Liberation Mono",Menlo,Courier,monospace);}/*!sc*/
.zEBjf:where([data-validation='error']){border-color:var(--borderColor-danger-emphasis,var(--color-danger-emphasis,#da3633));}/*!sc*/
.zEBjf:where([data-validation='error']):where([data-trailing-action][data-focused]),.zEBjf:where([data-validation='error']):where(:not([data-trailing-action])):focus-within{border-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.zEBjf:where([data-validation='success']){border-color:var(--bgColor-success-emphasis,var(--color-success-emphasis,#238636));}/*!sc*/
.zEBjf:where([data-block]){width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-self:stretch;-ms-flex-item-align:stretch;align-self:stretch;}/*!sc*/
@media (min-width:768px){.zEBjf{font-size:var(--text-body-size-medium);}}/*!sc*/
.zEBjf:where([data-size='small']){--inner-action-size:var(--base-size-20);min-height:var(--base-size-28);padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:var(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.zEBjf:where([data-size='large']){--inner-action-size:var(--base-size-28);height:var(--base-size-40);padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);}/*!sc*/
.zEBjf:where([data-variant='small']){min-height:28px;padding-top:3px;padding-right:var(--base-size-8);padding-bottom:3px;padding-left:var(--base-size-8);font-size:(--text-body-size-small);line-height:var(--base-size-20);}/*!sc*/
.zEBjf:where([data-variant='large']){padding-top:10px;padding-right:var(--base-size-8);padding-bottom:10px;padding-left:var(--base-size-8);font-size:var(--text-title-size-medium);}/*!sc*/
.zEBjf{margin-top:8px;border-radius:6px;}/*!sc*/
data-styled.g14[id="TextInputWrapper__StyledTextInputBaseWrapper-sc-1mqhpbi-0"]{content:"bclhiL,zEBjf,"}/*!sc*/
.jpROxA{background-repeat:no-repeat;background-position:right 8px center;padding-right:0;padding-left:0;}/*!sc*/
.jpROxA > :not(:last-child){margin-right:8px;}/*!sc*/
.jpROxA .TextInput-icon,.jpROxA .TextInput-action{-webkit-align-self:center;-ms-flex-item-align:center;align-self:center;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;}/*!sc*/
.jpROxA > input,.jpROxA > select{padding-right:0;padding-left:0;}/*!sc*/
.jpROxA:where([data-leading-visual]){padding-left:var(--base-size-12);}/*!sc*/
.jpROxA:where([data-trailing-visual]:not([data-trailing-action])){padding-right:var(--base-size-12);}/*!sc*/
.jpROxA:where(:not([data-leading-visual])) > input,.jpROxA:where(:not([data-leading-visual])) > select{padding-left:var(--base-size-12);}/*!sc*/
.jpROxA:where(:not([data-trailing-visual]):not([data-trailing-action])) > input,.jpROxA:where(:not([data-trailing-visual]):not([data-trailing-action])) > select{padding-right:var(--base-size-12);}/*!sc*/
.jpROxA{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;min-width:160px;}/*!sc*/
.kOcqDw{background-repeat:no-repeat;background-position:right 8px center;padding-right:0;padding-left:0;}/*!sc*/
.kOcqDw > :not(:last-child){margin-right:8px;}/*!sc*/
.kOcqDw .TextInput-icon,.kOcqDw .TextInput-action{-webkit-align-self:center;-ms-flex-item-align:center;align-self:center;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;}/*!sc*/
.kOcqDw > input,.kOcqDw > select{padding-right:0;padding-left:0;}/*!sc*/
.kOcqDw:where([data-leading-visual]){padding-left:var(--base-size-12);}/*!sc*/
.kOcqDw:where([data-trailing-visual]:not([data-trailing-action])){padding-right:var(--base-size-12);}/*!sc*/
.kOcqDw:where(:not([data-leading-visual])) > input,.kOcqDw:where(:not([data-leading-visual])) > select{padding-left:var(--base-size-12);}/*!sc*/
.kOcqDw:where(:not([data-trailing-visual]):not([data-trailing-action])) > input,.kOcqDw:where(:not([data-trailing-visual]):not([data-trailing-action])) > select{padding-right:var(--base-size-12);}/*!sc*/
.kOcqDw{margin-top:8px;border-radius:6px;}/*!sc*/
data-styled.g15[id="TextInputWrapper__StyledTextInputWrapper-sc-1mqhpbi-1"]{content:"jpROxA,kOcqDw,"}/*!sc*/
.fLAhLl{display:none;}/*!sc*/
.fLAhLl[popover]{position:absolute;padding:0.5em 0.75em;width:-webkit-max-content;width:-moz-max-content;width:max-content;margin:auto;-webkit-clip:auto;clip:auto;white-space:normal;font:normal normal 11px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";-webkit-font-smoothing:subpixel-antialiased;color:var(--tooltip-fgColor,var(--fgColor-onEmphasis,var(--color-fg-on-emphasis,#ffffff)));text-align:center;word-wrap:break-word;background:var(--tooltip-bgColor,var(--bgColor-emphasis,var(--color-neutral-emphasis-plus,#6e7681)));border-radius:6px;border:0;opacity:0;max-width:250px;inset:auto;overflow:visible;}/*!sc*/
.fLAhLl[popover]:popover-open{display:block;}/*!sc*/
.fLAhLl[popover].\:popover-open{display:block;}/*!sc*/
@media (forced-colors:active){.fLAhLl{outline:1px solid transparent;}}/*!sc*/
.fLAhLl::after{position:absolute;display:block;right:0;left:0;height:var(--overlay-offset,0.25rem);content:'';}/*!sc*/
.fLAhLl[data-direction='n']::after,.fLAhLl[data-direction='ne']::after,.fLAhLl[data-direction='nw']::after{top:100%;}/*!sc*/
.fLAhLl[data-direction='s']::after,.fLAhLl[data-direction='se']::after,.fLAhLl[data-direction='sw']::after{bottom:100%;}/*!sc*/
.fLAhLl[data-direction='w']::after{position:absolute;display:block;height:100%;width:8px;content:'';bottom:0;left:100%;}/*!sc*/
.fLAhLl[data-direction='e']::after{position:absolute;display:block;height:100%;width:8px;content:'';bottom:0;right:100%;margin-left:-8px;}/*!sc*/
@-webkit-keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
@keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
.fLAhLl:popover-open,.fLAhLl:popover-open::before{-webkit-animation-name:tooltip-appear;animation-name:tooltip-appear;-webkit-animation-duration:0.1s;animation-duration:0.1s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in;-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
.fLAhLl.\:popover-open,.fLAhLl.\:popover-open::before{-webkit-animation-name:tooltip-appear;animation-name:tooltip-appear;-webkit-animation-duration:0.1s;animation-duration:0.1s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in;-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
data-styled.g16[id="Tooltip__StyledTooltip-sc-e45c7z-0"]{content:"fLAhLl,"}/*!sc*/
.fiSvBN{position:relative;display:inline-block;}/*!sc*/
.fiSvBN::after{position:absolute;z-index:1000000;display:none;padding:0.5em 0.75em;font:normal normal 11px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";-webkit-font-smoothing:subpixel-antialiased;color:var(--tooltip-fgColor,var(--fgColor-onEmphasis,var(--color-fg-on-emphasis,#ffffff)));text-align:center;-webkit-text-decoration:none;text-decoration:none;text-shadow:none;text-transform:none;-webkit-letter-spacing:normal;-moz-letter-spacing:normal;-ms-letter-spacing:normal;letter-spacing:normal;word-wrap:break-word;white-space:pre;pointer-events:none;content:attr(aria-label);background:var(--tooltip-bgColor,var(--bgColor-emphasis,var(--color-neutral-emphasis-plus,#6e7681)));border-radius:6px;opacity:0;}/*!sc*/
@-webkit-keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
@keyframes tooltip-appear{from{opacity:0;}to{opacity:1;}}/*!sc*/
.fiSvBN:hover::after,.fiSvBN:active::after,.fiSvBN:focus::after,.fiSvBN:focus-within::after{display:inline-block;-webkit-text-decoration:none;text-decoration:none;-webkit-animation-name:tooltip-appear;animation-name:tooltip-appear;-webkit-animation-duration:0.1s;animation-duration:0.1s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in;-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
.fiSvBN.tooltipped-no-delay:hover::after,.fiSvBN.tooltipped-no-delay:active::after,.fiSvBN.tooltipped-no-delay:focus::after,.fiSvBN.tooltipped-no-delay:focus-within::after{-webkit-animation-delay:0s;animation-delay:0s;}/*!sc*/
.fiSvBN.tooltipped-multiline:hover::after,.fiSvBN.tooltipped-multiline:active::after,.fiSvBN.tooltipped-multiline:focus::after,.fiSvBN.tooltipped-multiline:focus-within::after{display:table-cell;}/*!sc*/
.fiSvBN.tooltipped-s::after,.fiSvBN.tooltipped-se::after,.fiSvBN.tooltipped-sw::after{top:100%;right:50%;margin-top:6px;}/*!sc*/
.fiSvBN.tooltipped-se::after{right:auto;left:50%;margin-left:-16px;}/*!sc*/
.fiSvBN.tooltipped-sw::after{margin-right:-16px;}/*!sc*/
.fiSvBN.tooltipped-n::after,.fiSvBN.tooltipped-ne::after,.fiSvBN.tooltipped-nw::after{right:50%;bottom:100%;margin-bottom:6px;}/*!sc*/
.fiSvBN.tooltipped-ne::after{right:auto;left:50%;margin-left:-16px;}/*!sc*/
.fiSvBN.tooltipped-nw::after{margin-right:-16px;}/*!sc*/
.fiSvBN.tooltipped-s::after,.fiSvBN.tooltipped-n::after{-webkit-transform:translateX(50%);-ms-transform:translateX(50%);transform:translateX(50%);}/*!sc*/
.fiSvBN.tooltipped-w::after{right:100%;bottom:50%;margin-right:6px;-webkit-transform:translateY(50%);-ms-transform:translateY(50%);transform:translateY(50%);}/*!sc*/
.fiSvBN.tooltipped-e::after{bottom:50%;left:100%;margin-left:6px;-webkit-transform:translateY(50%);-ms-transform:translateY(50%);transform:translateY(50%);}/*!sc*/
.fiSvBN.tooltipped-multiline::after{width:-webkit-max-content;width:-moz-max-content;width:max-content;max-width:250px;word-wrap:break-word;white-space:pre-line;border-collapse:separate;}/*!sc*/
.fiSvBN.tooltipped-multiline.tooltipped-s::after,.fiSvBN.tooltipped-multiline.tooltipped-n::after{right:auto;left:50%;-webkit-transform:translateX(-50%);-ms-transform:translateX(-50%);transform:translateX(-50%);}/*!sc*/
.fiSvBN.tooltipped-multiline.tooltipped-w::after,.fiSvBN.tooltipped-multiline.tooltipped-e::after{right:100%;}/*!sc*/
.fiSvBN.tooltipped-align-right-2::after{right:0;margin-right:0;}/*!sc*/
.fiSvBN.tooltipped-align-left-2::after{left:0;margin-left:0;}/*!sc*/
data-styled.g17[id="Tooltip__TooltipBase-sc-17tf59c-0"]{content:"fiSvBN,"}/*!sc*/
.eAtkQz{display:inline-block;overflow:hidden;text-overflow:ellipsis;vertical-align:top;white-space:nowrap;max-width:125px;max-width:100%;}/*!sc*/
.btDQYJ{display:inherit;overflow:hidden;text-overflow:ellipsis;vertical-align:initial;white-space:nowrap;max-width:125px;max-width:180px;display:block;}/*!sc*/
data-styled.g19[id="Truncate__StyledTruncate-sc-23o1d2-0"]{content:"eAtkQz,btDQYJ,"}/*!sc*/
.kQyrwv{--segmented-control-button-inner-padding:12px;--segmented-control-button-bg-inset:4px;--segmented-control-outer-radius:6px;background-color:transparent;border-color:transparent;border-radius:var(--segmented-control-outer-radius);border-width:0;color:currentColor;cursor:pointer;font-family:inherit;font-size:inherit;font-weight:600;padding:0;height:100%;width:100%;}/*!sc*/
.kQyrwv:focus:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.kQyrwv:focus:not(:disabled):not(:focus-visible){outline:solid 1px transparent;}/*!sc*/
.kQyrwv:focus-visible:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.kQyrwv .segmentedControl-content{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;background-color:var(--controlKnob-bgColor-rest,var(--color-segmented-control-button-bg,#0d1117));border-color:var(--controlKnob-borderColor-rest,var(--color-segmented-control-button-selected-border,#6e7681));border-style:solid;border-width:1px;border-radius:var(--segmented-control-outer-radius);display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;height:100%;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;padding-left:var(--segmented-control-button-inner-padding);padding-right:var(--segmented-control-button-inner-padding);}/*!sc*/
.kQyrwv svg{fill:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.kQyrwv:focus:focus-visible:not(:last-child):after{width:0;}/*!sc*/
.kQyrwv .segmentedControl-text:after{content:"Code";display:block;font-weight:600;height:0;overflow:hidden;pointer-events:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;visibility:hidden;}/*!sc*/
@media (pointer:coarse){.kQyrwv:before{content:"";position:absolute;left:0;right:0;-webkit-transform:translateY(-50%);-ms-transform:translateY(-50%);transform:translateY(-50%);top:50%;min-height:44px;}}/*!sc*/
.gKyOFO{--segmented-control-button-inner-padding:12px;--segmented-control-button-bg-inset:4px;--segmented-control-outer-radius:6px;background-color:transparent;border-color:transparent;border-radius:var(--segmented-control-outer-radius);border-width:0;color:currentColor;cursor:pointer;font-family:inherit;font-size:inherit;font-weight:400;padding:var(--segmented-control-button-bg-inset);height:100%;width:100%;}/*!sc*/
.gKyOFO:focus:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.gKyOFO:focus:not(:disabled):not(:focus-visible){outline:solid 1px transparent;}/*!sc*/
.gKyOFO:focus-visible:not(:disabled){box-shadow:none;outline:2px solid var(--fgColor-accent,var(--color-accent-fg,#2f81f7));outline-offset:-1px;}/*!sc*/
.gKyOFO .segmentedControl-content{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;background-color:transparent;border-color:transparent;border-style:solid;border-width:1px;border-radius:calc(var(--segmented-control-outer-radius) - var(--segmented-control-button-bg-inset) / 2);display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;height:100%;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;padding-left:calc(var(--segmented-control-button-inner-padding) - var(--segmented-control-button-bg-inset));padding-right:calc(var(--segmented-control-button-inner-padding) - var(--segmented-control-button-bg-inset));}/*!sc*/
.gKyOFO svg{fill:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.gKyOFO:hover .segmentedControl-content{background-color:var(--controlTrack-bgColor-hover,var(--color-segmented-control-button-hover-bg,#30363d));}/*!sc*/
.gKyOFO:active .segmentedControl-content{background-color:var(--controlTrack-bgColor-active,var(--color-segmented-control-button-active-bg,#21262d));}/*!sc*/
.gKyOFO:focus:focus-visible:not(:last-child):after{width:0;}/*!sc*/
.gKyOFO .segmentedControl-text:after{content:"Blame";display:block;font-weight:600;height:0;overflow:hidden;pointer-events:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;visibility:hidden;}/*!sc*/
@media (pointer:coarse){.gKyOFO:before{content:"";position:absolute;left:0;right:0;-webkit-transform:translateY(-50%);-ms-transform:translateY(-50%);transform:translateY(-50%);top:50%;min-height:44px;}}/*!sc*/
data-styled.g105[id="SegmentedControlButton__SegmentedControlButtonStyled-sc-8lkgxl-0"]{content:"kQyrwv,gKyOFO,"}/*!sc*/
.eYPFoP{background-color:var(--controlTrack-bgColor-rest,var(--color-segmented-control-bg,rgba(110,118,129,0.1)));border-radius:6px;border:1px solid;border-color:var(--controlTrack-borderColor-rest,transparent);display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;font-size:14px;height:28px;margin:0;padding:0;}/*!sc*/
data-styled.g107[id="SegmentedControl__SegmentedControlList-sc-1rzig82-0"]{content:"eYPFoP,"}/*!sc*/
body[data-page-layout-dragging="true"]{cursor:col-resize;}/*!sc*/
body[data-page-layout-dragging="true"] *{-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;}/*!sc*/
data-styled.g108[id="sc-global-gbKrvU1"]{content:"sc-global-gbKrvU1,"}/*!sc*/
.cJWUiG{list-style:none;padding:0;margin:0;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item{outline:none;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item:focus-visible > div,.cJWUiG .PRIVATE_TreeView-item.focus-visible > div{box-shadow:inset 0 0 0 2px var(--fgColor-accent,var(--color-accent-fg,#2f81f7));}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item:focus-visible > div,.cJWUiG .PRIVATE_TreeView-item.focus-visible > div{outline:2px solid HighlightText;outline-offset:-2;}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item[data-has-leading-action]{--has-leading-action:1;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-container{--level:1;--toggle-width:1rem;--min-item-height:2rem;position:relative;display:grid;--leading-action-width:calc(var(--has-leading-action,0) * 1.5rem);--spacer-width:calc(calc(var(--level) - 1) * (var(--toggle-width) / 2));grid-template-columns:var(--spacer-width) var(--leading-action-width) var(--toggle-width) 1fr;grid-template-areas:'spacer leadingAction toggle content';width:100%;font-size:14px;color:var(--fgColor-default,var(--color-fg-default,#e6edf3));border-radius:6px;cursor:pointer;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-container:hover{background-color:var(--control-transparent-bgColor-hover,var(--color-action-list-item-default-hover-bg,rgba(177,186,196,0.12)));}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item-container:hover{outline:2px solid transparent;outline-offset:-2px;}}/*!sc*/
@media (pointer:coarse){.cJWUiG .PRIVATE_TreeView-item-container{--toggle-width:1.5rem;--min-item-height:2.75rem;}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-container:has(.PRIVATE_TreeView-item-skeleton):hover{background-color:transparent;cursor:default;}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item-container:has(.PRIVATE_TreeView-item-skeleton):hover{outline:none;}}/*!sc*/
.cJWUiG[data-omit-spacer='true'] .PRIVATE_TreeView-item-container{grid-template-columns:0 0 0 1fr;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item[aria-current='true'] > .PRIVATE_TreeView-item-container{background-color:var(--control-transparent-bgColor-selected,var(--color-action-list-item-default-selected-bg,rgba(177,186,196,0.08)));}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item[aria-current='true'] > .PRIVATE_TreeView-item-container::after{content:'';position:absolute;top:calc(50% - 0.75rem);left:-8px;width:0.25rem;height:1.5rem;background-color:var(--fgColor-accent,var(--color-accent-fg,#2f81f7));border-radius:6px;}/*!sc*/
@media (forced-colors:active){.cJWUiG .PRIVATE_TreeView-item[aria-current='true'] > .PRIVATE_TreeView-item-container::after{background-color:HighlightText;}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-toggle{grid-area:toggle;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;-webkit-align-items:flex-start;-webkit-box-align:flex-start;-ms-flex-align:flex-start;align-items:flex-start;padding-top:calc(var(--min-item-height) / 2 - 12px / 2);height:100%;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-toggle--hover:hover{background-color:var(--control-transparent-bgColor-hover,var(--color-tree-view-item-chevron-hover-bg,rgba(177,186,196,0.12)));}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-toggle--end{border-top-left-radius:6px;border-bottom-left-radius:6px;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-content{grid-area:content;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;height:100%;padding:0 8px;gap:8px;line-height:var(--custom-line-height,var(--text-body-lineHeight-medium,1.4285));padding-top:calc((var(--min-item-height) - var(--custom-line-height,1.3rem)) / 2);padding-bottom:calc((var(--min-item-height) - var(--custom-line-height,1.3rem)) / 2);}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-content-text{-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;width:0;}/*!sc*/
.cJWUiG[data-truncate-text='true'] .PRIVATE_TreeView-item-content-text{overflow:hidden;white-space:nowrap;text-overflow:ellipsis;}/*!sc*/
.cJWUiG[data-truncate-text='false'] .PRIVATE_TreeView-item-content-text{word-break:break-word;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-visual{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));height:var(--custom-line-height,1.3rem);}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-leading-action{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;color:var(--fgColor-muted,var(--color-fg-muted,#848d97));grid-area:leadingAction;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-leading-action > button{-webkit-flex-shrink:1;-ms-flex-negative:1;flex-shrink:1;}/*!sc*/
.cJWUiG .PRIVATE_TreeView-item-level-line{width:100%;height:100%;border-right:1px solid;border-color:var(--borderColor-muted,var(--color-border-subtle,rgba(240,246,252,0.1)));}/*!sc*/
@media (hover:hover){.cJWUiG .PRIVATE_TreeView-item-level-line{border-color:transparent;}.cJWUiG:hover .PRIVATE_TreeView-item-level-line,.cJWUiG:focus-within .PRIVATE_TreeView-item-level-line{border-color:var(--borderColor-muted,var(--color-border-subtle,rgba(240,246,252,0.1)));}}/*!sc*/
.cJWUiG .PRIVATE_TreeView-directory-icon{display:grid;color:var(--treeViewItem-leadingVisual-iconColor-rest,var(--color-tree-view-item-directory-fill,#848d97));}/*!sc*/
.cJWUiG .PRIVATE_VisuallyHidden{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;-webkit-clip:rect(0,0,0,0);clip:rect(0,0,0,0);white-space:nowrap;border-width:0;}/*!sc*/
data-styled.g114[id="TreeView__UlBox-sc-4ex6b6-0"]{content:"cJWUiG,"}/*!sc*/
</style><meta data-hydrostats="publish"/> <!-- --> <!-- --> <button hidden="" data-testid="header-permalink-button" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><div><div style="--sticky-pane-height:100vh;--spacing:var(--spacing-none)" class="Box-sc-g0xbh4-0 hOfjFo"><div class="Box-sc-g0xbh4-0 oDGAe"><div class="Box-sc-g0xbh4-0 kowOcT"><div tabindex="0" class="Box-sc-g0xbh4-0 gISSDQ"><div class="Box-sc-g0xbh4-0 fHCyST"><div class="Box-sc-g0xbh4-0 hPvFuC"></div><div style="--pane-width:320px" class="Box-sc-g0xbh4-0 fFSoPl"><div class="react-tree-pane-contents-3-panel"><div id="repos-file-tree" class="Box-sc-g0xbh4-0 birIjn"><div class="Box-sc-g0xbh4-0 hNNCwk"><div class="Box-sc-g0xbh4-0 jfIeyl"><h2 class="Box-sc-g0xbh4-0 XosP prc-Heading-Heading-6CmGO"><button style="--button-color:fg.muted" type="button" aria-label="Expand file tree" data-testid="expand-file-tree-button-mobile" class="Box-sc-g0xbh4-0 hMLRgO prc-Button-ButtonBase-c50BI" data-loading="false" data-size="medium" data-variant="invisible" aria-describedby=":Rl6mplab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-arrow-left" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M7.78 12.53a.75.75 0 0 1-1.06 0L2.47 8.28a.75.75 0 0 1 0-1.06l4.25-4.25a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042L4.81 7h7.44a.75.75 0 0 1 0 1.5H4.81l2.97 2.97a.75.75 0 0 1 0 1.06Z"></path></svg></span><span data-component="text" class="prc-Button-Label-pTQ3x">Files</span></span></button><span role="tooltip" aria-label="Collapse file tree" id="expand-button-file-tree-button" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-se"><button data-component="IconButton" type="button" data-testid="collapse-file-tree-button" aria-expanded="true" aria-controls="repos-file-tree" class="prc-Button-ButtonBase-c50BI position-relative ExpandFileTreeButton-module__expandButton--gL4is ExpandFileTreeButton-module__filesButtonBreakpoint--WfX9t fgColor-muted prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="invisible" aria-describedby=":R356mplab:-loading-announcement" aria-labelledby="expand-button-file-tree-button"><svg aria-hidden="true" focusable="false" class="octicon octicon-sidebar-expand" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m4.177 7.823 2.396-2.396A.25.25 0 0 1 7 5.604v4.792a.25.25 0 0 1-.427.177L4.177 8.177a.25.25 0 0 1 0-.354Z"></path><path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25H9.5v-13Zm12.5 13a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25H11v13Z"></path></svg></button></span><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button></h2><h2 class="Box-sc-g0xbh4-0 kOkWgo prc-Heading-Heading-6CmGO">Files</h2></div><div class="Box-sc-g0xbh4-0 lhbroM"><div class="Box-sc-g0xbh4-0 khzwtX"><button type="button" aria-haspopup="true" aria-expanded="false" tabindex="0" aria-label="main branch" data-testid="anchor-button" 
class="Box-sc-g0xbh4-0 JMXqM prc-Button-ButtonBase-c50BI react-repos-tree-pane-ref-selector width-full ref-selector-class" data-loading="false" data-size="medium" data-variant="default" aria-describedby="branch-picker-repos-header-ref-selector-loading-announcement" id="branch-picker-repos-header-ref-selector"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="text" class="prc-Button-Label-pTQ3x"><div class="Box-sc-g0xbh4-0 bZBlpz"><div class="Box-sc-g0xbh4-0 bJjzmO"><svg aria-hidden="true" focusable="false" class="octicon octicon-git-branch" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M9.5 3.25a2.25 2.25 0 1 1 3 2.122V6A2.5 2.5 0 0 1 10 8.5H6a1 1 0 0 0-1 1v1.128a2.251 2.251 0 1 1-1.5 0V5.372a2.25 2.25 0 1 1 1.5 0v1.836A2.493 2.493 0 0 1 6 7h4a1 1 0 0 0 1-1v-.628A2.25 2.25 0 0 1 9.5 3.25Zm-6 0a.75.75 0 1 0 1.5 0 .75.75 0 0 0-1.5 0Zm8.25-.75a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5ZM4.25 12a.75.75 0 1 0 0 1.5.75.75 0 0 0 0-1.5Z"></path></svg></div><div class="Box-sc-g0xbh4-0 ffLUq ref-selector-button-text-container"><span class="Text__StyledText-sc-17v1xeu-0 eMMFM"> <!-- -->main</span></div></div></span><span data-component="trailingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-triangle-down" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path></svg></span></span></button><button hidden="" data-hotkey-scope="read-only-cursor-text-area"></button></div><div class="Box-sc-g0xbh4-0 eTeVqd"><a sx="[object Object]" data-component="IconButton" type="button" aria-label="Add file" class="Box-sc-g0xbh4-0 jNHrPP prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R6q6mplab:-loading-announcement :Rq6mplab:" href="/Alimiji/Solr_utilisation/new/main"><svg aria-hidden="true" focusable="false" class="octicon octicon-plus" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M7.75 2a.75.75 0 0 1 .75.75V7h4.25a.75.75 0 0 1 0 1.5H8.5v4.25a.75.75 0 0 1-1.5 0V8.5H2.75a.75.75 0 0 1 0-1.5H7V2.75A.75.75 0 0 1 7.75 2Z"></path></svg></a><span class="Tooltip__StyledTooltip-sc-e45c7z-0 fLAhLl" data-direction="s" aria-label="Add file" role="tooltip" aria-hidden="true" id=":Rq6mplab:">Add file</span><button data-component="IconButton" type="button" aria-label="Search this repository" class="Box-sc-g0xbh4-0 ijefGF prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R3a6mplab:-loading-announcement"><svg aria-hidden="true" focusable="false" class="octicon octicon-search" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 
7Z"></path></svg></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button></div></div></div><div class="Box-sc-g0xbh4-0 ftzGWg"><span class="TextInputWrapper__StyledTextInputBaseWrapper-sc-1mqhpbi-0 bclhiL TextInputWrapper__StyledTextInputWrapper-sc-1mqhpbi-1 jpROxA TextInput-wrapper" data-leading-visual="true" data-trailing-visual="true" aria-busy="false"><span class="TextInput-icon" id=":R5amplab:" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-search" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M10.68 11.74a6 6 0 0 1-7.922-8.982 6 6 0 0 1 8.982 7.922l3.04 3.04a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215ZM11.5 7a4.499 4.499 0 1 0-8.997 0A4.499 4.499 0 0 0 11.5 7Z"></path></svg></span><input type="text" aria-label="Go to file" role="combobox" aria-controls="file-results-list" aria-expanded="false" aria-haspopup="dialog" autoCorrect="off" spellcheck="false" placeholder="Go to file" aria-describedby=":R5amplab: :R5amplabH1:" data-component="input" class="UnstyledTextInput__ToggledUnstyledTextInput-sc-14ypya-0 jkNcAv" value=""/><span class="TextInput-icon" id=":R5amplabH1:" aria-hidden="true"><div class="Box-sc-g0xbh4-0 dItACB"><kbd>t</kbd></div></span></span></div><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><div class="Box-sc-g0xbh4-0 gjtfVk"><div class="react-tree-show-tree-items"><div data-testid="repos-file-tree-container" class="Box-sc-g0xbh4-0 cOxzdh"><nav aria-label="File Tree Navigation"><span role="status" aria-live="polite" aria-atomic="true" class="_VisuallyHidden__VisuallyHidden-sc-11jhm7a-0 brGdpi"></span><ul role="tree" aria-label="Files" data-truncate-text="true" class="TreeView__UlBox-sc-4ex6b6-0 cJWUiG"><li class="PRIVATE_TreeView-item" tabindex="0" id="RI_PySolr (1).pdf-item" role="treeitem" aria-labelledby=":R39implab:" aria-describedby=":R39implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R39implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":R39implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>RI_PySolr (1).pdf</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="commande_curl_solr.pdf-item" role="treeitem" aria-labelledby=":R59implab:" aria-describedby=":R59implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div 
style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R59implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":R59implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>commande_curl_solr.pdf</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="extraire_fichiers.py-item" role="treeitem" aria-labelledby=":R79implab:" aria-describedby=":R79implabH1:" aria-level="1" aria-current="true" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R79implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":R79implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>extraire_fichiers.py</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="main.py-item" role="treeitem" aria-labelledby=":R99implab:" aria-describedby=":R99implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R99implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":R99implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span 
class="PRIVATE_TreeView-item-content-text"><span>main.py</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="notes.txt-item" role="treeitem" aria-labelledby=":Rb9implab:" aria-describedby=":Rb9implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rb9implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":Rb9implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>notes.txt</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="precision_recall.py-item" role="treeitem" aria-labelledby=":Rd9implab:" aria-describedby=":Rd9implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rd9implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":Rd9implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>precision_recall.py</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="requete_resultat_solr.py-item" role="treeitem" aria-labelledby=":Rf9implab:" aria-describedby=":Rf9implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rf9implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":Rf9implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 
.784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>requete_resultat_solr.py</span></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="requetes.py-item" role="treeitem" aria-labelledby=":Rh9implab:" aria-describedby=":Rh9implabH1:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1;content-visibility:auto;contain-intrinsic-size:auto 2rem"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rh9implab:" class="PRIVATE_TreeView-item-content"><div class="PRIVATE_VisuallyHidden" aria-hidden="true" id=":Rh9implabH1:"></div><div class="PRIVATE_TreeView-item-visual" aria-hidden="true"><svg aria-hidden="true" focusable="false" class="octicon octicon-file" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2 1.75C2 .784 2.784 0 3.75 0h6.586c.464 0 .909.184 1.237.513l2.914 2.914c.329.328.513.773.513 1.237v9.586A1.75 1.75 0 0 1 13.25 16h-9.5A1.75 1.75 0 0 1 2 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h9.5a.25.25 0 0 0 .25-.25V6h-2.75A1.75 1.75 0 0 1 9 4.25V1.5Zm6.75.062V4.25c0 .138.112.25.25.25h2.688l-.011-.013-2.914-2.914-.013-.011Z"></path></svg></div><span class="PRIVATE_TreeView-item-content-text"><span>requetes.py</span></span></div></div></li></ul></nav></div></div></div></div></div></div><div class="Box-sc-g0xbh4-0 bTBnTW"><div role="slider" aria-label="Draggable pane splitter" aria-valuemin="0" aria-valuemax="0" aria-valuenow="0" aria-valuetext="Pane width 0 pixels" tabindex="0" class="Box-sc-g0xbh4-0 fFMzrG"></div></div></div></div><div class="Box-sc-g0xbh4-0 iKqMNA"><div class="Box-sc-g0xbh4-0"></div><div class="Box-sc-g0xbh4-0 FxAyp"><div data-selector="repos-split-pane-content" tabindex="0" class="Box-sc-g0xbh4-0 leYMvG"><div class="Box-sc-g0xbh4-0 KMPzq"><div class="Box-sc-g0xbh4-0 hfKjHv container"><div class="px-3 pt-3 pb-0" id="StickyHeader"><div class="Box-sc-g0xbh4-0 gZWyZE"><div class="Box-sc-g0xbh4-0 dwYKDk"><div class="Box-sc-g0xbh4-0 iDtIiT"><div class="Box-sc-g0xbh4-0 cEytCf"><nav data-testid="breadcrumbs" aria-labelledby="repos-header-breadcrumb--wide-heading" id="repos-header-breadcrumb--wide" class="Box-sc-g0xbh4-0 fzFXnm"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading" id="repos-header-breadcrumb--wide-heading">Breadcrumbs</h2><ol class="Box-sc-g0xbh4-0 iMnkmv"><li class="Box-sc-g0xbh4-0 ghzDag"><a class="Box-sc-g0xbh4-0 kHuKdh prc-Link-Link-85e08" sx="[object Object]" data-testid="breadcrumbs-repo-link" href="/Alimiji/Solr_utilisation/tree/main">Solr_utilisation</a></li></ol></nav><div data-testid="breadcrumbs-filename" class="Box-sc-g0xbh4-0 ghzDag"><span class="Text__StyledText-sc-17v1xeu-0 iHQnrN" aria-hidden="true">/</span><h1 class="Box-sc-g0xbh4-0 jGhzSQ prc-Heading-Heading-6CmGO" tabindex="-1" id="file-name-id-wide">extraire_fichiers.py</h1></div><button data-component="IconButton" type="button" class="prc-Button-ButtonBase-c50BI ml-2 prc-Button-IconButton-szpyj" 
data-loading="false" data-no-visuals="true" data-size="small" data-variant="invisible" aria-describedby=":R3td9lab:-loading-announcement" aria-labelledby=":Rdd9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-copy" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path></svg></button><span class="Tooltip__StyledTooltip-sc-e45c7z-0 fLAhLl CopyToClipboardButton-module__tooltip--Dq1IB" data-direction="nw" aria-label="Copy path" aria-hidden="true" id=":Rdd9lab:">Copy path</span></div></div><div class="react-code-view-header-element--wide"><div class="Box-sc-g0xbh4-0 faNtbn"><div class="d-flex gap-2"> <button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><button type="button" class="Box-sc-g0xbh4-0 dwNhzn prc-Button-ButtonBase-c50BI" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R2l6d9lab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="text" class="prc-Button-Label-pTQ3x">Blame</span></span></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button data-component="IconButton" type="button" aria-label="More file actions" title="More file actions" data-testid="more-file-actions-button-nav-menu-wide" aria-haspopup="true" aria-expanded="false" tabindex="0" class="Box-sc-g0xbh4-0 kVRliy prc-Button-ButtonBase-c50BI js-blob-dropdown-click prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R156d9lab:-loading-announcement" id=":R156d9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-kebab-horizontal" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M8 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3ZM1.5 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Zm13 0a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path></svg></button> </div></div></div><div class="react-code-view-header-element--narrow"><div class="Box-sc-g0xbh4-0 faNtbn"><div class="d-flex gap-2"> <button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><button type="button" class="Box-sc-g0xbh4-0 dwNhzn prc-Button-ButtonBase-c50BI" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R2l7d9lab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="text" 
class="prc-Button-Label-pTQ3x">Blame</span></span></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button data-component="IconButton" type="button" aria-label="More file actions" title="More file actions" data-testid="more-file-actions-button-nav-menu-narrow" aria-haspopup="true" aria-expanded="false" tabindex="0" class="Box-sc-g0xbh4-0 kVRliy prc-Button-ButtonBase-c50BI js-blob-dropdown-click prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="medium" data-variant="default" aria-describedby=":R157d9lab:-loading-announcement" id=":R157d9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-kebab-horizontal" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M8 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3ZM1.5 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Zm13 0a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path></svg></button> </div></div></div></div></div></div></div></div><div class="Box-sc-g0xbh4-0 dJxjrT react-code-view-bottom-padding"> <div class="Box-sc-g0xbh4-0 eFxKDQ"></div> <!-- --> <!-- --> </div><div class="Box-sc-g0xbh4-0 dJxjrT"> <!-- --> <!-- --> <button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button><div class="d-flex flex-column border rounded-2 mb-3 pl-1"><div class="Box-sc-g0xbh4-0 dzCJzi"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading">Latest commit</h2><div style="width:120px" class="Skeleton Skeleton--text" data-testid="loading"> </div><div class="d-flex flex-shrink-0 gap-2"><div data-testid="latest-commit-details" class="d-none d-sm-flex flex-items-center"></div><div class="d-flex gap-2"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading">History</h2><a href="/Alimiji/Solr_utilisation/commits/main/extraire_fichiers.py" class="prc-Button-ButtonBase-c50BI d-none d-lg-flex LinkButton-module__code-view-link-button--xvCGA flex-items-center fgColor-default" data-loading="false" data-size="small" data-variant="invisible" aria-describedby=":R5dlal9lab:-loading-announcement"><span data-component="buttonContent" data-align="center" class="prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-history" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m.427 1.927 1.215 1.215a8.002 8.002 0 1 1-1.6 5.685.75.75 0 1 1 1.493-.154 6.5 6.5 0 1 0 1.18-4.458l1.358 1.358A.25.25 0 0 1 3.896 6H.25A.25.25 0 0 1 0 5.75V2.104a.25.25 0 0 1 .427-.177ZM7.75 4a.75.75 0 0 1 .75.75v2.992l2.028.812a.75.75 0 0 1-.557 1.392l-2.5-1A.751.751 0 0 1 7 8.25v-3.5A.75.75 0 0 1 7.75 4Z"></path></svg></span><span data-component="text" class="prc-Button-Label-pTQ3x"><span class="fgColor-default">History</span></span></span></a><div class="d-sm-none"></div><div class="d-flex d-lg-none"><span role="tooltip" aria-label="History" id="history-icon-button-tooltip" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-n"><a href="/Alimiji/Solr_utilisation/commits/main/extraire_fichiers.py" class="prc-Button-ButtonBase-c50BI 
LinkButton-module__code-view-link-button--xvCGA flex-items-center fgColor-default" data-loading="false" data-size="small" data-variant="invisible" aria-describedby=":Rpdlal9lab:-loading-announcement history-icon-button-tooltip"><span data-component="buttonContent" data-align="center" class="prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-history" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m.427 1.927 1.215 1.215a8.002 8.002 0 1 1-1.6 5.685.75.75 0 1 1 1.493-.154 6.5 6.5 0 1 0 1.18-4.458l1.358 1.358A.25.25 0 0 1 3.896 6H.25A.25.25 0 0 1 0 5.75V2.104a.25.25 0 0 1 .427-.177ZM7.75 4a.75.75 0 0 1 .75.75v2.992l2.028.812a.75.75 0 0 1-.557 1.392l-2.5-1A.751.751 0 0 1 7 8.25v-3.5A.75.75 0 0 1 7.75 4Z"></path></svg></span></span></a></span></div></div></div></div></div><div class="Box-sc-g0xbh4-0 ldRxiI"><div class="Box-sc-g0xbh4-0 efRoCL container"><div class="Box-sc-g0xbh4-0 gNAmSV react-code-size-details-banner"><div class="Box-sc-g0xbh4-0 jNEwzY react-code-size-details-banner"><div class="Box-sc-g0xbh4-0 ifyOQK text-mono"><div title="6.04 KB" data-testid="blob-size" class="Truncate__StyledTruncate-sc-23o1d2-0 eAtkQz"><span>171 lines (131 loc) · 6.04 KB</span></div></div></div></div><div class="Box-sc-g0xbh4-0 jdLMhu react-blob-view-header-sticky" id="repos-sticky-header"><div class="Box-sc-g0xbh4-0 tOISc"><div class="react-blob-sticky-header"><div class="Box-sc-g0xbh4-0 hqwSEx"><div class="Box-sc-g0xbh4-0 lzKZY"><div class="Box-sc-g0xbh4-0 fHind"><nav data-testid="breadcrumbs" aria-labelledby="sticky-breadcrumb-heading" id="sticky-breadcrumb" class="Box-sc-g0xbh4-0 fzFXnm"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading" id="sticky-breadcrumb-heading">Breadcrumbs</h2><ol class="Box-sc-g0xbh4-0 iMnkmv"><li class="Box-sc-g0xbh4-0 ghzDag"><a class="Box-sc-g0xbh4-0 kHuKdh prc-Link-Link-85e08" sx="[object Object]" data-testid="breadcrumbs-repo-link" href="/Alimiji/Solr_utilisation/tree/main">Solr_utilisation</a></li></ol></nav><div data-testid="breadcrumbs-filename" class="Box-sc-g0xbh4-0 ghzDag"><span class="Text__StyledText-sc-17v1xeu-0 wcuBT" aria-hidden="true">/</span><h1 class="Box-sc-g0xbh4-0 dnZoUW prc-Heading-Heading-6CmGO" tabindex="-1" id="sticky-file-name-id">extraire_fichiers.py</h1></div></div><button style="--button-color:fg.default" type="button" class="Box-sc-g0xbh4-0 dpNnZU prc-Button-ButtonBase-c50BI" data-loading="false" data-size="small" data-variant="invisible" aria-describedby=":Riptal9lab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="leadingVisual" class="prc-Button-Visual-2epfX prc-Button-VisualWrap-Db-eB"><svg aria-hidden="true" focusable="false" class="octicon octicon-arrow-up" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M3.47 7.78a.75.75 0 0 1 0-1.06l4.25-4.25a.75.75 0 0 1 1.06 0l4.25 4.25a.751.751 0 0 1-.018 1.042.751.751 0 0 1-1.042.018L9 4.81v7.44a.75.75 0 0 1-1.5 0V4.81L4.53 7.78a.75.75 0 0 1-1.06 0Z"></path></svg></span><span data-component="text" 
class="prc-Button-Label-pTQ3x">Top</span></span></button></div></div></div><div class="Box-sc-g0xbh4-0 gpHFJV"><h2 class="sr-only ScreenReaderHeading-module__userSelectNone--vW4Cq prc-Heading-Heading-6CmGO" data-testid="screen-reader-heading">File metadata and controls</h2><div class="Box-sc-g0xbh4-0 iNMjfP"><ul aria-label="File view" class="SegmentedControl__SegmentedControlList-sc-1rzig82-0 eYPFoP" data-size="small"><li class="Box-sc-g0xbh4-0 fefCSX" data-selected="true"><button aria-current="true" class="SegmentedControlButton__SegmentedControlButtonStyled-sc-8lkgxl-0 kQyrwv" type="button"><span class="segmentedControl-content"><div class="Box-sc-g0xbh4-0 segmentedControl-text" data-text="Code">Code</div></span></button></li><li class="Box-sc-g0xbh4-0 sulSy"><button aria-current="false" class="SegmentedControlButton__SegmentedControlButtonStyled-sc-8lkgxl-0 gKyOFO" type="button"><span class="segmentedControl-content"><div class="Box-sc-g0xbh4-0 segmentedControl-text" data-text="Blame">Blame</div></span></button></li></ul><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><div class="Box-sc-g0xbh4-0 jNEwzY react-code-size-details-in-header"><div class="Box-sc-g0xbh4-0 ifyOQK text-mono"><div title="6.04 KB" data-testid="blob-size" class="Truncate__StyledTruncate-sc-23o1d2-0 eAtkQz"><span>171 lines (131 loc) · 6.04 KB</span></div></div></div></div><div class="Box-sc-g0xbh4-0 kcLCKF"><div class="Box-sc-g0xbh4-0 pr-0 prc-ButtonGroup-ButtonGroup-vcMeG"><div><button data-component="IconButton" type="button" data-testid="copilot-ask-menu" class="prc-Button-ButtonBase-c50BI AskCopilotButton-module__square--o8kDO prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby="blob-view-header-copilot-icon-loading-announcement" aria-labelledby=":Rbsptal9lab:" id="blob-view-header-copilot-icon"><svg aria-hidden="true" focusable="false" class="octicon octicon-copilot" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M7.998 15.035c-4.562 0-7.873-2.914-7.998-3.749V9.338c.085-.628.677-1.686 1.588-2.065.013-.07.024-.143.036-.218.029-.183.06-.384.126-.612-.201-.508-.254-1.084-.254-1.656 0-.87.128-1.769.693-2.484.579-.733 1.494-1.124 2.724-1.261 1.206-.134 2.262.034 2.944.765.05.053.096.108.139.165.044-.057.094-.112.143-.165.682-.731 1.738-.899 2.944-.765 1.23.137 2.145.528 2.724 1.261.566.715.693 1.614.693 2.484 0 .572-.053 1.148-.254 1.656.066.228.098.429.126.612.012.076.024.148.037.218.924.385 1.522 1.471 1.591 2.095v1.872c0 .766-3.351 3.795-8.002 3.795Zm0-1.485c2.28 0 4.584-1.11 5.002-1.433V7.862l-.023-.116c-.49.21-1.075.291-1.727.291-1.146 0-2.059-.327-2.71-.991A3.222 3.222 0 0 1 8 6.303a3.24 3.24 0 0 1-.544.743c-.65.664-1.563.991-2.71.991-.652 0-1.236-.081-1.727-.291l-.023.116v4.255c.419.323 2.722 1.433 5.002 1.433ZM6.762 2.83c-.193-.206-.637-.413-1.682-.297-1.019.113-1.479.404-1.713.7-.247.312-.369.789-.369 1.554 0 .793.129 1.171.308 1.371.162.181.519.379 1.442.379.853 0 1.339-.235 1.638-.54.315-.322.527-.827.617-1.553.117-.935-.037-1.395-.241-1.614Zm4.155-.297c-1.044-.116-1.488.091-1.681.297-.204.219-.359.679-.242 1.614.091.726.303 1.231.618 1.553.299.305.784.54 1.638.54.922 0 1.28-.198 1.442-.379.179-.2.308-.578.308-1.371 0-.765-.123-1.242-.37-1.554-.233-.296-.693-.587-1.713-.7Z"></path><path 
d="M6.25 9.037a.75.75 0 0 1 .75.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 .75-.75Zm4.25.75v1.501a.75.75 0 0 1-1.5 0V9.787a.75.75 0 0 1 1.5 0Z"></path></svg></button><span class="Tooltip__StyledTooltip-sc-e45c7z-0 fLAhLl" data-direction="s" aria-hidden="true" id=":Rbsptal9lab:">Ask Copilot about this file</span></div><div></div></div><div class="Box-sc-g0xbh4-0 kVWtTz react-blob-header-edit-and-raw-actions"><div class="Box-sc-g0xbh4-0 prc-ButtonGroup-ButtonGroup-vcMeG"><div><a href="https://github.com/Alimiji/Solr_utilisation/raw/refs/heads/main/extraire_fichiers.py" data-testid="raw-button" class="Box-sc-g0xbh4-0 gWqxTd prc-Button-ButtonBase-c50BI" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":R5csptal9lab:-loading-announcement"><span data-component="buttonContent" class="Box-sc-g0xbh4-0 gUkoLg prc-Button-ButtonContent-HKbr-"><span data-component="text" class="prc-Button-Label-pTQ3x">Raw</span></span></a></div><div><button data-component="IconButton" type="button" aria-label="Copy raw content" data-testid="copy-raw-button" class="prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Rpcsptal9lab:-loading-announcement"><svg aria-hidden="true" focusable="false" class="octicon octicon-copy" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path></svg></button></div><div><span role="tooltip" aria-label="Download raw file" id=":Rdcsptal9lab:" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-n"><button data-component="IconButton" type="button" aria-label="Download raw content" data-testid="download-raw-button" class="Box-sc-g0xbh4-0 ivobqY prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Rtcsptal9lab:-loading-announcement"><svg aria-hidden="true" focusable="false" class="octicon octicon-download" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M2.75 14A1.75 1.75 0 0 1 1 12.25v-2.5a.75.75 0 0 1 1.5 0v2.5c0 .138.112.25.25.25h10.5a.25.25 0 0 0 .25-.25v-2.5a.75.75 0 0 1 1.5 0v2.5A1.75 1.75 0 0 1 13.25 14Z"></path><path d="M7.25 7.689V2a.75.75 0 0 1 1.5 0v5.689l1.97-1.969a.749.749 0 1 1 1.06 1.06l-3.25 3.25a.749.749 0 0 1-1.06 0L4.22 6.78a.749.749 0 1 1 1.06-1.06l1.97 1.969Z"></path></svg></button></span></div></div><button hidden="" data-testid="raw-button-shortcut" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden="" data-testid="copy-raw-button-shortcut" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden="" data-testid="download-raw-button-shortcut" data-hotkey-scope="read-only-cursor-text-area"></button><a class="js-github-dev-shortcut d-none prc-Link-Link-85e08" href="https://github.dev/"></a><button 
hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><a class="js-github-dev-new-tab-shortcut d-none prc-Link-Link-85e08" href="https://github.dev/" target="_blank"></a><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><div class="Box-sc-g0xbh4-0 prc-ButtonGroup-ButtonGroup-vcMeG"><div><span role="tooltip" aria-label="Edit the file in your fork of this project" id=":R6ksptal9lab:" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-nw"><a sx="[object Object]" data-component="IconButton" type="button" aria-label="Edit file" data-testid="edit-button" class="Box-sc-g0xbh4-0 kilKoS prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Rmksptal9lab:-loading-announcement" href="/Alimiji/Solr_utilisation/edit/main/extraire_fichiers.py"><svg aria-hidden="true" focusable="false" class="octicon octicon-pencil" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M11.013 1.427a1.75 1.75 0 0 1 2.474 0l1.086 1.086a1.75 1.75 0 0 1 0 2.474l-8.61 8.61c-.21.21-.47.364-.756.445l-3.251.93a.75.75 0 0 1-.927-.928l.929-3.25c.081-.286.235-.547.445-.758l8.61-8.61Zm.176 4.823L9.75 4.81l-6.286 6.287a.253.253 0 0 0-.064.108l-.558 1.953 1.953-.558a.253.253 0 0 0 .108-.064Zm1.238-3.763a.25.25 0 0 0-.354 0L10.811 3.75l1.439 1.44 1.263-1.263a.25.25 0 0 0 0-.354Z"></path></svg></a></span></div><div><button data-component="IconButton" type="button" aria-label="More edit options" data-testid="more-edit-button" aria-haspopup="true" aria-expanded="false" tabindex="0" class="prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="default" aria-describedby=":Raksptal9lab:-loading-announcement" id=":Raksptal9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-triangle-down" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="m4.427 7.427 3.396 3.396a.25.25 0 0 0 .354 0l3.396-3.396A.25.25 0 0 0 11.396 7H4.604a.25.25 0 0 0-.177.427Z"></path></svg></button></div></div><button hidden="" data-testid="" data-hotkey="e,Shift+E" data-hotkey-scope="read-only-cursor-text-area"></button></div><span role="tooltip" aria-label="Close symbols panel" id=":R5sptal9lab:" class="Tooltip__TooltipBase-sc-17tf59c-0 fiSvBN tooltipped-nw"><button data-component="IconButton" type="button" aria-label="Symbols" aria-pressed="true" aria-expanded="true" aria-controls="symbols-pane" data-testid="symbols-button" class="Box-sc-g0xbh4-0 hySUEo prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="invisible" aria-describedby="symbols-button-loading-announcement" id="symbols-button"><svg aria-hidden="true" focusable="false" class="octicon octicon-code-square" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M0 1.75C0 .784.784 0 1.75 0h12.5C15.216 0 16 .784 16 1.75v12.5A1.75 1.75 0 0 1 14.25 16H1.75A1.75 1.75 0 0 1 0 14.25Zm1.75-.25a.25.25 0 0 0-.25.25v12.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V1.75a.25.25 0 0 0-.25-.25Zm7.47 3.97a.75.75 0 0 1 1.06 0l2 2a.75.75 0 0 1 0 1.06l-2 
2a.749.749 0 0 1-1.275-.326.749.749 0 0 1 .215-.734L10.69 8 9.22 6.53a.75.75 0 0 1 0-1.06ZM6.78 6.53 5.31 8l1.47 1.47a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215l-2-2a.75.75 0 0 1 0-1.06l2-2a.751.751 0 0 1 1.042.018.751.751 0 0 1 .018 1.042Z"></path></svg></button></span><div class="react-blob-header-edit-and-raw-actions-combined"><button data-component="IconButton" type="button" aria-label="Edit and raw actions" title="More file actions" data-testid="more-file-actions-button" aria-haspopup="true" aria-expanded="false" tabindex="0" class="Box-sc-g0xbh4-0 itGLhU prc-Button-ButtonBase-c50BI js-blob-dropdown-click prc-Button-IconButton-szpyj" data-loading="false" data-no-visuals="true" data-size="small" data-variant="invisible" aria-describedby=":Rnsptal9lab:-loading-announcement" id=":Rnsptal9lab:"><svg aria-hidden="true" focusable="false" class="octicon octicon-kebab-horizontal" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M8 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3ZM1.5 9a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Zm13 0a1.5 1.5 0 1 0 0-3 1.5 1.5 0 0 0 0 3Z"></path></svg></button></div></div></div></div><div></div></div><div class="Box-sc-g0xbh4-0 hycJXc"><section aria-labelledby="file-name-id-wide file-name-id-mobile" class="Box-sc-g0xbh4-0 dceWRL"><div class="Box-sc-g0xbh4-0 dGXHv"><div id="highlighted-line-menu-positioner" class="position-relative"><div id="copilot-button-positioner" class="Box-sc-g0xbh4-0 bpDFns"><div class="Box-sc-g0xbh4-0 iJOeCH"><div class="Box-sc-g0xbh4-0 jewUnv react-code-file-contents" role="presentation" aria-hidden="true" data-tab-size="8" data-paste-markdown-skip="true" data-hpc="true"><div class="react-line-numbers" style="pointer-events:auto"><div data-line-number="1" class="react-line-number react-code-text" style="padding-right:16px">1</div><div data-line-number="2" class="react-line-number react-code-text" style="padding-right:16px">2</div><div data-line-number="3" class="react-line-number react-code-text" style="padding-right:16px">3</div><div data-line-number="4" class="react-line-number react-code-text" style="padding-right:16px">4</div><div data-line-number="5" class="react-line-number react-code-text" style="padding-right:16px">5</div><div data-line-number="6" class="react-line-number react-code-text" style="padding-right:16px">6</div><div data-line-number="7" class="react-line-number react-code-text" style="padding-right:16px">7</div><div data-line-number="8" class="react-line-number react-code-text" style="padding-right:16px">8</div><div data-line-number="9" class="react-line-number react-code-text" style="padding-right:16px">9</div><div data-line-number="10" class="react-line-number react-code-text" style="padding-right:16px">10</div><div data-line-number="11" class="react-line-number react-code-text" style="padding-right:16px">11</div><div data-line-number="12" class="react-line-number react-code-text" style="padding-right:16px">12</div><div data-line-number="13" class="react-line-number react-code-text" style="padding-right:16px">13</div><div data-line-number="14" class="react-line-number react-code-text" style="padding-right:16px">14</div><div data-line-number="15" class="react-line-number react-code-text" style="padding-right:16px">15</div><div data-line-number="16" class="react-line-number react-code-text" style="padding-right:16px">16</div><div data-line-number="17" class="react-line-number react-code-text" 
style="padding-right:16px">17</div><div data-line-number="18" class="react-line-number react-code-text" style="padding-right:16px">18</div><div data-line-number="19" class="react-line-number react-code-text" style="padding-right:16px">19</div><div data-line-number="20" class="react-line-number react-code-text" style="padding-right:16px">20</div><div data-line-number="21" class="react-line-number react-code-text" style="padding-right:16px">21</div><div data-line-number="22" class="react-line-number react-code-text" style="padding-right:16px">22</div><div data-line-number="23" class="react-line-number react-code-text" style="padding-right:16px">23</div><div data-line-number="24" class="react-line-number react-code-text" style="padding-right:16px">24</div><div data-line-number="25" class="react-line-number react-code-text" style="padding-right:16px">25</div><div data-line-number="26" class="react-line-number react-code-text" style="padding-right:16px">26</div><div data-line-number="27" class="react-line-number react-code-text" style="padding-right:16px">27</div><div data-line-number="28" class="react-line-number react-code-text" style="padding-right:16px">28</div><div data-line-number="29" class="react-line-number react-code-text" style="padding-right:16px">29</div><div data-line-number="30" class="react-line-number react-code-text" style="padding-right:16px">30</div><div data-line-number="31" class="react-line-number react-code-text" style="padding-right:16px">31</div><div data-line-number="32" class="react-line-number react-code-text" style="padding-right:16px">32</div><div data-line-number="33" class="react-line-number react-code-text" style="padding-right:16px">33</div><div data-line-number="34" class="react-line-number react-code-text" style="padding-right:16px">34</div><div data-line-number="35" class="react-line-number react-code-text" style="padding-right:16px">35</div><div data-line-number="36" class="react-line-number react-code-text" style="padding-right:16px">36</div><div data-line-number="37" class="react-line-number react-code-text" style="padding-right:16px">37</div><div data-line-number="38" class="react-line-number react-code-text" style="padding-right:16px">38</div><div data-line-number="39" class="react-line-number react-code-text" style="padding-right:16px">39</div><div data-line-number="40" class="react-line-number react-code-text" style="padding-right:16px">40</div><div data-line-number="41" class="react-line-number react-code-text" style="padding-right:16px">41</div><div data-line-number="42" class="react-line-number react-code-text" style="padding-right:16px">42</div><div data-line-number="43" class="react-line-number react-code-text" style="padding-right:16px">43<span class="Box-sc-g0xbh4-0 cJGaMs"><div aria-label="Collapse code section" role="button" class="Box-sc-g0xbh4-0 iGLarr"><svg aria-hidden="true" focusable="false" class="Octicon-sc-9kayk9-0" viewBox="0 0 16 16" width="16" height="16" fill="currentColor" style="display:inline-block;user-select:none;vertical-align:text-bottom;overflow:visible"><path d="M12.78 5.22a.749.749 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.06 0L3.22 6.28a.749.749 0 1 1 1.06-1.06L8 8.939l3.72-3.719a.749.749 0 0 1 1.06 0Z"></path></svg></div></span></div><div data-line-number="44" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">44</div><div data-line-number="45" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">45</div><div data-line-number="46" 
class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">46</div><div data-line-number="47" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">47</div><div data-line-number="48" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">48</div><div data-line-number="49" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">49</div><div data-line-number="50" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">50</div><div data-line-number="51" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">51</div><div data-line-number="52" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">52</div><div data-line-number="53" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">53</div><div data-line-number="54" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">54</div><div data-line-number="55" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">55</div><div data-line-number="56" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">56</div><div data-line-number="57" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">57</div><div data-line-number="58" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">58</div><div data-line-number="59" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">59</div><div data-line-number="60" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">60</div><div data-line-number="61" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">61</div><div data-line-number="62" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">62</div><div data-line-number="63" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">63</div><div data-line-number="64" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">64</div><div data-line-number="65" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">65</div><div data-line-number="66" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">66</div><div data-line-number="67" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">67</div><div data-line-number="68" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">68</div><div data-line-number="69" class="child-of-line-42  react-line-number react-code-text" style="padding-right:16px">69</div><div data-line-number="70" class="react-line-number react-code-text" style="padding-right:16px">70</div><div data-line-number="71" class="react-line-number react-code-text" style="padding-right:16px">71</div><div data-line-number="72" class="react-line-number react-code-text" style="padding-right:16px">72</div><div data-line-number="73" class="react-line-number react-code-text" style="padding-right:16px">73</div><div data-line-number="74" class="react-line-number react-code-text" style="padding-right:16px">74</div><div data-line-number="75" class="react-line-number react-code-text" style="padding-right:16px">75</div><div data-line-number="76" 
class="react-line-number react-code-text" style="padding-right:16px">76</div><div data-line-number="77" class="react-line-number react-code-text" style="padding-right:16px">77</div><div data-line-number="78" class="react-line-number react-code-text" style="padding-right:16px">78</div><div data-line-number="79" class="react-line-number react-code-text" style="padding-right:16px">79</div><div data-line-number="80" class="react-line-number react-code-text" style="padding-right:16px">80</div><div data-line-number="81" class="react-line-number react-code-text" style="padding-right:16px">81</div><div data-line-number="82" class="react-line-number react-code-text" style="padding-right:16px">82</div><div data-line-number="83" class="react-line-number react-code-text" style="padding-right:16px">83</div><div data-line-number="84" class="react-line-number react-code-text" style="padding-right:16px">84</div><div data-line-number="85" class="react-line-number react-code-text" style="padding-right:16px">85</div><div data-line-number="86" class="react-line-number react-code-text" style="padding-right:16px">86</div><div data-line-number="87" class="react-line-number react-code-text" style="padding-right:16px">87</div><div data-line-number="88" class="react-line-number react-code-text" style="padding-right:16px">88</div><div data-line-number="89" class="react-line-number react-code-text" style="padding-right:16px">89</div><div data-line-number="90" class="react-line-number react-code-text" style="padding-right:16px">90</div><div data-line-number="91" class="react-line-number react-code-text" style="padding-right:16px">91</div><div data-line-number="92" class="react-line-number react-code-text" style="padding-right:16px">92</div><div data-line-number="93" class="react-line-number react-code-text" style="padding-right:16px">93</div><div data-line-number="94" class="react-line-number react-code-text" style="padding-right:16px">94</div><div data-line-number="95" class="react-line-number react-code-text" style="padding-right:16px">95</div><div data-line-number="96" class="react-line-number react-code-text" style="padding-right:16px">96</div><div data-line-number="97" class="react-line-number react-code-text" style="padding-right:16px">97</div><div data-line-number="98" class="react-line-number react-code-text" style="padding-right:16px">98</div><div data-line-number="99" class="react-line-number react-code-text" style="padding-right:16px">99</div><div data-line-number="100" class="react-line-number react-code-text" style="padding-right:16px">100</div><div data-line-number="101" class="react-line-number react-code-text" style="padding-right:16px">101</div><div data-line-number="102" class="react-line-number react-code-text" style="padding-right:16px">102</div><div data-line-number="103" class="react-line-number react-code-text" style="padding-right:16px">103</div><div data-line-number="104" class="react-line-number react-code-text" style="padding-right:16px">104</div><div data-line-number="105" class="react-line-number react-code-text" style="padding-right:16px">105</div><div data-line-number="106" class="react-line-number react-code-text" style="padding-right:16px">106</div><div data-line-number="107" class="react-line-number react-code-text" style="padding-right:16px">107</div><div data-line-number="108" class="react-line-number react-code-text" style="padding-right:16px">108</div><div data-line-number="109" class="react-line-number react-code-text" style="padding-right:16px">109</div><div 
data-line-number="110" class="react-line-number react-code-text" style="padding-right:16px">110</div><div data-line-number="111" class="react-line-number react-code-text" style="padding-right:16px">111</div><div data-line-number="112" class="react-line-number react-code-text" style="padding-right:16px">112</div><div data-line-number="113" class="react-line-number react-code-text" style="padding-right:16px">113</div><div data-line-number="114" class="react-line-number react-code-text" style="padding-right:16px">114</div><div data-line-number="115" class="react-line-number react-code-text" style="padding-right:16px">115</div><div data-line-number="116" class="react-line-number react-code-text" style="padding-right:16px">116</div><div data-line-number="117" class="react-line-number react-code-text" style="padding-right:16px">117</div><div data-line-number="118" class="react-line-number react-code-text" style="padding-right:16px">118</div><div data-line-number="119" class="react-line-number react-code-text" style="padding-right:16px">119</div><div data-line-number="120" class="react-line-number react-code-text" style="padding-right:16px">120</div><div data-line-number="121" class="react-line-number react-code-text" style="padding-right:16px">121</div><div data-line-number="122" class="react-line-number react-code-text" style="padding-right:16px">122</div><div data-line-number="123" class="react-line-number react-code-text" style="padding-right:16px">123</div><div data-line-number="124" class="react-line-number react-code-text" style="padding-right:16px">124</div><div data-line-number="125" class="react-line-number react-code-text" style="padding-right:16px">125</div><div data-line-number="126" class="react-line-number react-code-text" style="padding-right:16px">126</div><div data-line-number="127" class="react-line-number react-code-text" style="padding-right:16px">127</div><div data-line-number="128" class="react-line-number react-code-text" style="padding-right:16px">128</div><div data-line-number="129" class="react-line-number react-code-text" style="padding-right:16px">129</div><div data-line-number="130" class="react-line-number react-code-text" style="padding-right:16px">130</div><div data-line-number="131" class="react-line-number react-code-text" style="padding-right:16px">131</div><div data-line-number="132" class="react-line-number react-code-text" style="padding-right:16px">132</div><div data-line-number="133" class="react-line-number react-code-text" style="padding-right:16px">133</div><div data-line-number="134" class="react-line-number react-code-text" style="padding-right:16px">134</div><div data-line-number="135" class="react-line-number react-code-text" style="padding-right:16px">135</div><div data-line-number="136" class="react-line-number react-code-text" style="padding-right:16px">136</div><div data-line-number="137" class="react-line-number react-code-text" style="padding-right:16px">137</div><div data-line-number="138" class="react-line-number react-code-text" style="padding-right:16px">138</div><div data-line-number="139" class="react-line-number react-code-text" style="padding-right:16px">139</div><div data-line-number="140" class="react-line-number react-code-text" style="padding-right:16px">140</div><div data-line-number="141" class="react-line-number react-code-text" style="padding-right:16px">141</div><div data-line-number="142" class="react-line-number react-code-text" style="padding-right:16px">142</div><div data-line-number="143" class="react-line-number 
react-code-text" style="padding-right:16px">143</div><div data-line-number="144" class="react-line-number react-code-text" style="padding-right:16px">144</div><div data-line-number="145" class="react-line-number react-code-text" style="padding-right:16px">145</div><div data-line-number="146" class="react-line-number react-code-text" style="padding-right:16px">146</div><div data-line-number="147" class="react-line-number react-code-text" style="padding-right:16px">147</div><div data-line-number="148" class="react-line-number react-code-text" style="padding-right:16px">148</div><div data-line-number="149" class="react-line-number react-code-text" style="padding-right:16px">149</div><div data-line-number="150" class="react-line-number react-code-text" style="padding-right:16px">150</div><div data-line-number="151" class="react-line-number react-code-text" style="padding-right:16px">151</div><div data-line-number="152" class="react-line-number react-code-text" style="padding-right:16px">152</div><div data-line-number="153" class="react-line-number react-code-text" style="padding-right:16px">153</div><div data-line-number="154" class="react-line-number react-code-text" style="padding-right:16px">154</div><div data-line-number="155" class="react-line-number react-code-text" style="padding-right:16px">155</div><div data-line-number="156" class="react-line-number react-code-text" style="padding-right:16px">156</div><div data-line-number="157" class="react-line-number react-code-text" style="padding-right:16px">157</div><div data-line-number="158" class="react-line-number react-code-text" style="padding-right:16px">158</div><div data-line-number="159" class="react-line-number react-code-text" style="padding-right:16px">159</div><div data-line-number="160" class="react-line-number react-code-text" style="padding-right:16px">160</div><div data-line-number="161" class="react-line-number react-code-text" style="padding-right:16px">161</div><div data-line-number="162" class="react-line-number react-code-text" style="padding-right:16px">162</div><div data-line-number="163" class="react-line-number react-code-text" style="padding-right:16px">163</div><div data-line-number="164" class="react-line-number react-code-text" style="padding-right:16px">164</div><div data-line-number="165" class="react-line-number react-code-text" style="padding-right:16px">165</div><div data-line-number="166" class="react-line-number react-code-text" style="padding-right:16px">166</div><div data-line-number="167" class="react-line-number react-code-text" style="padding-right:16px">167</div><div data-line-number="168" class="react-line-number react-code-text" style="padding-right:16px">168</div><div data-line-number="169" class="react-line-number react-code-text" style="padding-right:16px">169</div></div><div class="react-code-lines"><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC1" class="react-file-line html-div" data-testid="code-cell" data-line-number="1" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">json</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC2" class="react-file-line html-div" data-testid="code-cell" data-line-number="2" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC3" class="react-file-line html-div" 
data-testid="code-cell" data-line-number="3" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">gzip</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC4" class="react-file-line html-div" data-testid="code-cell" data-line-number="4" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">shutil</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC5" class="react-file-line html-div" data-testid="code-cell" data-line-number="5" style="position:relative"><span class="pl-k">from</span> <span class="pl-s1">requetes</span> <span class="pl-k">import</span> <span class="pl-s1">extraire_requetes_longues</span>, <span class="pl-s1">extraire_requetes_courtes</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC6" class="react-file-line html-div" data-testid="code-cell" data-line-number="6" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC7" class="react-file-line html-div" data-testid="code-cell" data-line-number="7" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC8" class="react-file-line html-div" data-testid="code-cell" data-line-number="8" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">gzip</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC9" class="react-file-line html-div" data-testid="code-cell" data-line-number="9" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">shutil</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC10" class="react-file-line html-div" data-testid="code-cell" data-line-number="10" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC11" class="react-file-line html-div" data-testid="code-cell" data-line-number="11" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC12" class="react-file-line html-div" data-testid="code-cell" data-line-number="12" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">gzip</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC13" class="react-file-line html-div" data-testid="code-cell" data-line-number="13" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">shutil</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC14" class="react-file-line html-div" data-testid="code-cell" data-line-number="14" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC15" class="react-file-line html-div" data-testid="code-cell" data-line-number="15" style="position:relative"><span class="pl-c"># Chemins des dossiers</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC16" class="react-file-line html-div" data-testid="code-cell" data-line-number="16" style="position:relative"><span class="pl-s1">input_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC17" class="react-file-line html-div" data-testid="code-cell" data-line-number="17" style="position:relative"><span class="pl-s1">output_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP_ok&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC18" class="react-file-line html-div" data-testid="code-cell" data-line-number="18" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC19" class="react-file-line html-div" data-testid="code-cell" data-line-number="19" style="position:relative"><span class="pl-c"># Vérifie que le dossier de sortie existe, sinon le crée</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC20" class="react-file-line html-div" data-testid="code-cell" data-line-number="20" style="position:relative"><span class="pl-s1">os</span>.<span class="pl-c1">makedirs</span>(<span class="pl-s1">output_folder</span>, <span class="pl-s1">exist_ok</span><span class="pl-c1">=</span><span class="pl-c1">True</span>)</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC21" class="react-file-line html-div" data-testid="code-cell" data-line-number="21" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC22" class="react-file-line html-div" data-testid="code-cell" data-line-number="22" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC23" class="react-file-line html-div" data-testid="code-cell" data-line-number="23" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC24" class="react-file-line html-div" data-testid="code-cell" data-line-number="24" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">os</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC25" class="react-file-line html-div" data-testid="code-cell" data-line-number="25" style="position:relative"><span class="pl-k">import</span> <span class="pl-s1">chardet</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC26" class="react-file-line html-div" data-testid="code-cell" data-line-number="26" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC27" class="react-file-line html-div" data-testid="code-cell" data-line-number="27" style="position:relative"><span class="pl-c"># Chemins des dossiers</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC28" class="react-file-line html-div" data-testid="code-cell" data-line-number="28" style="position:relative"><span class="pl-s1">input_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP__ok&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC29" class="react-file-line html-div" data-testid="code-cell" data-line-number="29" style="position:relative"><span class="pl-s1">output_folder</span> <span class="pl-c1">=</span> <span class="pl-s">&#039;/home/alimijileking/PycharmProjects/Solr_project/AP_fixed&#039;</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC30" class="react-file-line html-div" data-testid="code-cell" data-line-number="30" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC31" class="react-file-line html-div" data-testid="code-cell" data-line-number="31" style="position:relative"><span class="pl-c"># Crée le dossier de sortie s&#039;il n&#039;existe pas</span></div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC32" class="react-file-line html-div" data-testid="code-cell" data-line-number="32" style="position:relative"><span class="pl-s1">os</span>.<span class="pl-c1">makedirs</span>(<span class="pl-s1">output_folder</span>, <span class="pl-s1">exist_ok</span><span class="pl-c1">=</span><span class="pl-c1">True</span>)</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC33" class="react-file-line html-div" data-testid="code-cell" data-line-number="33" style="position:relative">
</div></div></div><div class="react-code-text react-code-line-contents" style="min-height:auto"><div><div id="LC34" class="react-file-line html-div" data-testid="code-cell" data-line-number="34" style="position:relative">
# Detect a file's character encoding with chardet
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']

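# For reference, chardet.detect returns a dict along the lines of
# {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''} (illustrative
# values only); the helper above keeps just the encoding name. Reading the whole
# file is fine for AP-sized files; very large inputs could be sampled instead.
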
# Turn one raw AP/TREC document into a Solr <doc> block
def transform_document(lines):
    doc_lines = []
    doc_lines.append("  <doc>")  # start of the document
    for line in lines:
        line = line.strip()
        if line.startswith("<DOCNO>"):
            content = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
            doc_lines.append(f"    <field name=\"DOCNO\">{content}</field>")
        elif line.startswith("<FILEID>"):
            content = line.replace("<FILEID>", "").replace("</FILEID>", "").strip()
            doc_lines.append(f"    <field name=\"FILEID\">{content}</field>")
        elif line.startswith("<FIRST>"):
            content = line.replace("<FIRST>", "").replace("</FIRST>", "").strip()
            doc_lines.append(f"    <field name=\"FIRST\">{content}</field>")
        elif line.startswith("<SECOND>"):
            content = line.replace("<SECOND>", "").replace("</SECOND>", "").strip()
            doc_lines.append(f"    <field name=\"SECOND\">{content}</field>")
        elif line.startswith("<HEAD>"):
            content = line.replace("<HEAD>", "").replace("</HEAD>", "").strip()
            doc_lines.append(f"    <field name=\"HEAD\">{content}</field>")
        elif line.startswith("<DATELINE>"):
            content = line.replace("<DATELINE>", "").replace("</DATELINE>", "").strip()
            doc_lines.append(f"    <field name=\"DATELINE\">{content}</field>")
        elif line.startswith("<TEXT>"):
            content = line.replace("<TEXT>", "").replace("</TEXT>", "").strip()
            doc_lines.append(f"    <field name=\"TEXT\">{content}</field>")
    doc_lines.append("  </doc>")  # end of the document
    return doc_lines

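# Illustration (hypothetical AP record) of what transform_document returns:
#   input :  <DOCNO> AP880212-0001 </DOCNO>
#            <HEAD>Some headline</HEAD>
#   output:    <doc>
#                <field name="DOCNO">AP880212-0001</field>
#                <field name="HEAD">Some headline</field>
#              </doc>
# Caveats: only lines that start with a known tag are kept (the body of a
# multi-line <TEXT> section is dropped), and field contents are not XML-escaped.
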
# Walk every XML file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.xml'):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        try:
            # Detect the encoding
            encoding = detect_encoding(input_path)
            print(f"Encodage détecté pour {filename}: {encoding}")

            # Read the file with the detected encoding
            with open(input_path, 'r', encoding=encoding, errors='ignore') as file:
                lines = file.readlines()

            # Transform and write to the output file
            with open(output_path, 'w', encoding='utf-8') as out_file:
                out_file.write("<add>\n")  # opening Solr root element
                current_doc = []
                in_doc = False
                for line in lines:
                    if "<DOC>" in line:  # start of a document
                        in_doc = True
                        current_doc = []
                    elif "</DOC>" in line:  # end of a document
                        in_doc = False
                        # Transform the document and write it out
                        transformed_doc = transform_document(current_doc)
                        out_file.write("\n".join(transformed_doc) + "\n")
                    elif in_doc:
                        current_doc.append(line)
                out_file.write("</add>\n")  # closing Solr root element

            print(f"Fichier corrigé et converti : {output_path}")

        except Exception as e:
            print(f"Erreur lors du traitement de {filename}: {e}")

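The <add>…</add> files produced above are in Solr's XML update format; the script itself stops at writing them to disk. Purely as a hedged illustration of the next step (not part of the original code; the core name "ap", the localhost URL, and the file name are assumptions), one converted file could be pushed to a running Solr instance like this:

import requests  # assumption: the requests package is installed

solr_update_url = "http://localhost:8983/solr/ap/update"  # hypothetical core name and URL
xml_path = "/home/alimijileking/PycharmProjects/Solr_project/AP_fixed/ap880212.xml"  # hypothetical file name

with open(xml_path, "rb") as f:
    resp = requests.post(
        solr_update_url,
        data=f.read(),
        headers={"Content-Type": "application/xml"},
        params={"commit": "true"},  # commit so the documents become searchable immediately
    )
print(resp.status_code, resp.text[:200])
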
"""
# Walk every file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.gz'):  # only process .gz archives
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename[:-3] + '.xml')  # drop '.gz', add '.xml'

        # Decompress the file and write it out directly with an .xml extension
        with gzip.open(input_path, 'rb') as f_in:
            with open(output_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        print(f"Fichier extrait et enregistré en XML : {output_path}")
"""

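# Note: gzip.open + shutil.copyfileobj streams the decompressed bytes in chunks,
# so even large AP archives are never held fully in memory.
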
"""
# Build the long queries

# Topic files to process
files = ['Topics-requetes/topics.1-50.txt', 'Topics-requetes/topics.51-100.txt', 'Topics-requetes/topics.101-150.txt']

# Helper to read a whole file; the two dicts below collect the combined results
def lire_fichier(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

req_longues_combines = {}
req_courtes_combines = {}

for fichier in files:
    data = lire_fichier(fichier)
    resultat = extraire_requetes_longues(data)
    req_longues_combines.update(resultat)

# Same for the short queries
for fichier in files:
    data = lire_fichier(fichier)
    resultat = extraire_requetes_courtes(data)
    req_courtes_combines.update(resultat)
"""
# Print the combined results
# print(resultats_combines)

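The extraire_requetes_longues / extraire_requetes_courtes helpers live in the separate requetes module, which is not shown in this file. Purely as a hypothetical sketch (the regexes and the "_sketch" name are assumptions based on the standard TREC topic layout, not the author's code), a short-query extractor could look like:

import re

def extraire_requetes_courtes_sketch(data):
    """Hypothetical: map each topic number to its <title> text."""
    requetes = {}
    for bloc in re.findall(r'<top>(.*?)</top>', data, re.S):
        num = re.search(r'<num>\s*Number:\s*(\d+)', bloc)
        titre = re.search(r'<title>\s*(?:Topic:)?\s*(.*)', bloc)
        if num and titre:
            requetes[num.group(1).lstrip('0')] = titre.group(1).strip()
    return requetes

A long-query variant would presumably concatenate the <desc> (and possibly <narr>) sections onto the title in the same loop.
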
# Export the queries to JSON files
"""
with open('requetes/requetes_longues.json', 'w', encoding='utf-8') as fichier_json:
    json.dump(req_longues_combines, fichier_json, ensure_ascii=False, indent=4)

# Create the short queries file
with open('requetes/requetes_courtes.json', 'w', encoding='utf-8') as fichier_json:
    json.dump(req_courtes_combines, fichier_json, ensure_ascii=False, indent=4)
"""
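If the commented-out blocks above are re-enabled, the two JSON files presumably map topic numbers to query strings. A minimal sanity check, assuming the files have already been generated at those paths, could be:

import json

with open('requetes/requetes_courtes.json', encoding='utf-8') as f:
    courtes = json.load(f)
print(len(courtes), "short queries loaded")
print(list(courtes.items())[:2])  # peek at the first two (topic, query) pairs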
class="TreeView__UlBox-sc-4ex6b6-0 cJWUiG"><li class="PRIVATE_TreeView-item" tabindex="0" id="0input_folder" role="treeitem" aria-labelledby=":R38qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R38qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="input_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>input_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="1output_folder" role="treeitem" aria-labelledby=":R58qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R58qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="output_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>output_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="2input_folder" role="treeitem" aria-labelledby=":R78qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R78qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="input_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>input_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="3output_folder" role="treeitem" aria-labelledby=":R98qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":R98qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 dotKsF"></div><div class="Box-sc-g0xbh4-0 iGIwaf">const</div></div>  <div title="output_folder" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>output_folder</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="4detect_encoding" role="treeitem" aria-labelledby=":Rb8qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rb8qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 gxAxAi"></div><div class="Box-sc-g0xbh4-0 gWkFIQ">func</div></div>  <div 
title="detect_encoding" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>detect_encoding</span></div></div></span></div></div></li><li class="PRIVATE_TreeView-item" tabindex="0" id="5transform_document" role="treeitem" aria-labelledby=":Rd8qtal9lab:" aria-level="1" aria-selected="false"><div class="PRIVATE_TreeView-item-container" style="--level:1"><div style="grid-area:spacer;display:flex"><div style="width:100%;display:flex"></div></div><div id=":Rd8qtal9lab:" class="PRIVATE_TreeView-item-content"><span class="PRIVATE_TreeView-item-content-text"><div class="Box-sc-g0xbh4-0 cSURfY"><div class="Box-sc-g0xbh4-0 bTXewe"><div class="Box-sc-g0xbh4-0 gxAxAi"></div><div class="Box-sc-g0xbh4-0 gWkFIQ">func</div></div>  <div title="transform_document" class="Truncate__StyledTruncate-sc-23o1d2-0 btDQYJ"><span>transform_document</span></div></div></span></div></div></li></ul></div></div></div></div></div></div> <!-- --> <!-- --> </div></div></div><div class="Box-sc-g0xbh4-0"></div></div></div></div></div><div id="find-result-marks-container" class="Box-sc-g0xbh4-0 cCoXib"></div><button hidden="" data-testid="" data-hotkey-scope="read-only-cursor-text-area"></button><button hidden=""></button></div> <!-- --> <!-- --> <script type="application/json" id="__PRIMER_DATA_:R0:__">{"resolvedServerColorMode":"night"}</script></div>
</react-app>
</turbo-frame>



  </div>

</turbo-frame>

    </main>
  </div>

  </div>

          <footer class="footer pt-8 pb-6 f6 color-fg-muted p-responsive" role="contentinfo" >
  <h2 class='sr-only'>Footer</h2>

  


  <div class="d-flex flex-justify-center flex-items-center flex-column-reverse flex-lg-row flex-wrap flex-lg-nowrap">
    <div class="d-flex flex-items-center flex-shrink-0 mx-2">
      <a aria-label="Homepage" title="GitHub" class="footer-octicon mr-2" href="https://github.com">
        <svg aria-hidden="true" height="24" viewBox="0 0 24 24" version="1.1" width="24" data-view-component="true" class="octicon octicon-mark-github">
    <path d="M12.5.75C6.146.75 1 5.896 1 12.25c0 5.089 3.292 9.387 7.863 10.91.575.101.79-.244.79-.546 0-.273-.014-1.178-.014-2.142-2.889.532-3.636-.704-3.866-1.35-.13-.331-.69-1.352-1.18-1.625-.402-.216-.977-.748-.014-.762.906-.014 1.553.834 1.769 1.179 1.035 1.74 2.688 1.25 3.349.948.1-.747.402-1.25.733-1.538-2.559-.287-5.232-1.279-5.232-5.678 0-1.25.445-2.285 1.178-3.09-.115-.288-.517-1.467.115-3.048 0 0 .963-.302 3.163 1.179.92-.259 1.897-.388 2.875-.388.977 0 1.955.13 2.875.388 2.2-1.495 3.162-1.179 3.162-1.179.633 1.581.23 2.76.115 3.048.733.805 1.179 1.825 1.179 3.09 0 4.413-2.688 5.39-5.247 5.678.417.36.776 1.05.776 2.128 0 1.538-.014 2.774-.014 3.162 0 .302.216.662.79.547C20.709 21.637 24 17.324 24 12.25 24 5.896 18.854.75 12.5.75Z"></path>
</svg>
</a>
      <span>
        &copy; 2025 GitHub,&nbsp;Inc.
      </span>
    </div>

    <nav aria-label="Footer">
      <h3 class="sr-only" id="sr-footer-heading">Footer navigation</h3>

      <ul class="list-style-none d-flex flex-justify-center flex-wrap mb-2 mb-lg-0" aria-labelledby="sr-footer-heading">

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to Terms&quot;,&quot;label&quot;:&quot;text:terms&quot;}" href="https://docs.github.com/site-policy/github-terms/github-terms-of-service" data-view-component="true" class="Link--secondary Link">Terms</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to privacy&quot;,&quot;label&quot;:&quot;text:privacy&quot;}" href="https://docs.github.com/site-policy/privacy-policies/github-privacy-statement" data-view-component="true" class="Link--secondary Link">Privacy</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to security&quot;,&quot;label&quot;:&quot;text:security&quot;}" href="https://github.com/security" data-view-component="true" class="Link--secondary Link">Security</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to status&quot;,&quot;label&quot;:&quot;text:status&quot;}" href="https://www.githubstatus.com/" data-view-component="true" class="Link--secondary Link">Status</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to docs&quot;,&quot;label&quot;:&quot;text:docs&quot;}" href="https://docs.github.com/" data-view-component="true" class="Link--secondary Link">Docs</a>
          </li>

          <li class="mx-2">
            <a data-analytics-event="{&quot;category&quot;:&quot;Footer&quot;,&quot;action&quot;:&quot;go to contact&quot;,&quot;label&quot;:&quot;text:contact&quot;}" href="https://support.github.com?tags=dotcom-footer" data-view-component="true" class="Link--secondary Link">Contact</a>
          </li>

          <li class="mx-2" >
  <cookie-consent-link>
    <button
      type="button"
      class="Link--secondary underline-on-hover border-0 p-0 color-bg-transparent"
      data-action="click:cookie-consent-link#showConsentManagement"
      data-analytics-event="{&quot;location&quot;:&quot;footer&quot;,&quot;action&quot;:&quot;cookies&quot;,&quot;context&quot;:&quot;subfooter&quot;,&quot;tag&quot;:&quot;link&quot;,&quot;label&quot;:&quot;cookies_link_subfooter_footer&quot;}"
    >
      Manage cookies
    </button>
  </cookie-consent-link>
</li>

<li class="mx-2">
  <cookie-consent-link>
    <button
      type="button"
      class="Link--secondary underline-on-hover border-0 p-0 color-bg-transparent"
      data-action="click:cookie-consent-link#showConsentManagement"
      data-analytics-event="{&quot;location&quot;:&quot;footer&quot;,&quot;action&quot;:&quot;dont_share_info&quot;,&quot;context&quot;:&quot;subfooter&quot;,&quot;tag&quot;:&quot;link&quot;,&quot;label&quot;:&quot;dont_share_info_link_subfooter_footer&quot;}"
    >
      Do not share my personal information
    </button>
  </cookie-consent-link>
</li>

      </ul>
    </nav>
  </div>
</footer>



    <ghcc-consent id="ghcc" class="position-fixed bottom-0 left-0" style="z-index: 999999" data-initial-cookie-consent-allowed="" data-cookie-consent-required="true"></ghcc-consent>



  <div id="ajax-error-message" class="ajax-error-message flash flash-error" hidden>
    <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-alert">
    <path d="M6.457 1.047c.659-1.234 2.427-1.234 3.086 0l6.082 11.378A1.75 1.75 0 0 1 14.082 15H1.918a1.75 1.75 0 0 1-1.543-2.575Zm1.763.707a.25.25 0 0 0-.44 0L1.698 13.132a.25.25 0 0 0 .22.368h12.164a.25.25 0 0 0 .22-.368Zm.53 3.996v2.5a.75.75 0 0 1-1.5 0v-2.5a.75.75 0 0 1 1.5 0ZM9 11a1 1 0 1 1-2 0 1 1 0 0 1 2 0Z"></path>
</svg>
    <button type="button" class="flash-close js-ajax-error-dismiss" aria-label="Dismiss error">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg>
    </button>
    You can’t perform that action at this time.
  </div>

    <template id="site-details-dialog">
  <details class="details-reset details-overlay details-overlay-dark lh-default color-fg-default hx_rsm" open>
    <summary role="button" aria-label="Close dialog"></summary>
    <details-dialog class="Box Box--overlay d-flex flex-column anim-fade-in fast hx_rsm-dialog hx_rsm-modal">
      <button class="Box-btn-octicon m-0 btn-octicon position-absolute right-0 top-0" type="button" aria-label="Close dialog" data-close-dialog>
        <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-x">
    <path d="M3.72 3.72a.75.75 0 0 1 1.06 0L8 6.94l3.22-3.22a.749.749 0 0 1 1.275.326.749.749 0 0 1-.215.734L9.06 8l3.22 3.22a.749.749 0 0 1-.326 1.275.749.749 0 0 1-.734-.215L8 9.06l-3.22 3.22a.751.751 0 0 1-1.042-.018.751.751 0 0 1-.018-1.042L6.94 8 3.72 4.78a.75.75 0 0 1 0-1.06Z"></path>
</svg>
      </button>
      <div class="octocat-spinner my-6 js-details-dialog-spinner"></div>
    </details-dialog>
  </details>
</template>

    <div class="Popover js-hovercard-content position-absolute" style="display: none; outline: none;">
  <div class="Popover-message Popover-message--bottom-left Popover-message--large Box color-shadow-large" style="width:360px;">
  </div>
</div>

    <template id="snippet-clipboard-copy-button">
  <div class="zeroclipboard-container position-absolute right-0 top-0">
    <clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0" data-copy-feedback="Copied!" data-tooltip-direction="w">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
    <path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-check js-clipboard-check-icon color-fg-success d-none m-2">
    <path d="M13.78 4.22a.75.75 0 0 1 0 1.06l-7.25 7.25a.75.75 0 0 1-1.06 0L2.22 9.28a.751.751 0 0 1 .018-1.042.751.751 0 0 1 1.042-.018L6 10.94l6.72-6.72a.75.75 0 0 1 1.06 0Z"></path>
</svg>
    </clipboard-copy>
  </div>
</template>
<template id="snippet-clipboard-copy-button-unpositioned">
  <div class="zeroclipboard-container">
    <clipboard-copy aria-label="Copy" class="ClipboardButton btn btn-invisible js-clipboard-copy m-2 p-0 d-flex flex-justify-center flex-items-center" data-copy-feedback="Copied!" data-tooltip-direction="w">
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon">
    <path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
      <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-check js-clipboard-check-icon color-fg-success d-none">
    <path d="M13.78 4.22a.75.75 0 0 1 0 1.06l-7.25 7.25a.75.75 0 0 1-1.06 0L2.22 9.28a.751.751 0 0 1 .018-1.042.751.751 0 0 1 1.042-.018L6 10.94l6.72-6.72a.75.75 0 0 1 1.06 0Z"></path>
</svg>
    </clipboard-copy>
  </div>
</template>


    <style>
      .user-mention[href$="/DominiqueLoyer"] {
        color: var(--color-user-mention-fg);
        background-color: var(--bgColor-attention-muted, var(--color-attention-subtle));
        border-radius: 2px;
        margin-left: -2px;
        margin-right: -2px;
      }
      .user-mention[href$="/DominiqueLoyer"]:before,
      .user-mention[href$="/DominiqueLoyer"]:after {
        content: '';
        display: inline-block;
        width: 2px;
      }
    </style>


    </div>

    <div id="js-global-screen-reader-notice" class="sr-only mt-n1" aria-live="polite" aria-atomic="true" ></div>
    <div id="js-global-screen-reader-notice-assertive" class="sr-only mt-n1" aria-live="assertive" aria-atomic="true"></div>
  </body>
</html>

To cite this code:

Loyer, Dominique. (2024). extraire_fichiers copie (trashed 2025-05-30 11-48-03).py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

flaskBackend.py

Flask backend for the credibility-verification system: it exposes a POST /api/verify endpoint that accepts a JSON payload with an 'input_data' field (a URL or raw text), runs it through a CredibilityVerificationSystem instance loaded once at startup, and returns the analysis report as JSON. CORS is enabled so the HTML frontend can call the API during development.

Keywords: Flask, REST API, CORS, credibility verification, JSON

# -*- coding: utf-8 -*-
"""Copie de Backend Flask (app.py)

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1YfExv7KBPl9bh7bTBrgqL3NnLaRLEmn0
"""

# Import necessary libraries
from flask import Flask, request, jsonify
# Install flask_cors if not already installed
try:
    from flask_cors import CORS # To handle Cross-Origin Resource Sharing
except ImportError:
    print("flask_cors not found. Installing...")
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "flask-cors"])  # '!pip ...' only works inside notebooks
    from flask_cors import CORS
    print("flask_cors installed successfully.")

import sys
import os
import traceback # To help with debugging errors

# --- Configuration Import ---
# Add the directory containing 'credibility_system.py' to the Python path
# Adjust this path if your file structure is different
# Assumes 'credibility_system.py' is in the same directory as 'app.py'
# If it's elsewhere, provide the correct path.
# Example: sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'path_to_your_module')))

# Initialize credibility_checker to None, so we can check if it was successfully imported
credibility_checker = None

try:
    # Assuming your CredibilityVerificationSystem class is in a file named 'credibility_system.py'
    # in the same directory as this 'app.py' file.
    from credibility_system import CredibilityVerificationSystem
    print("Successfully imported CredibilityVerificationSystem.")
    # --- Instantiate the System ---
    # Load models only once when the server starts
    # This can take time, so the server might be slow to start initially.
    print("Initializing CredibilityVerificationSystem... (This might take a moment)")
    credibility_checker = CredibilityVerificationSystem()
    print("CredibilityVerificationSystem initialized.")

except ImportError:
    print("Error: Could not import CredibilityVerificationSystem.")
    print("Please ensure 'credibility_system.py' exists and is in the correct path.")
    print(f"Current sys.path: {sys.path}")
    # Removed sys.exit() to avoid stopping the notebook execution
    # The credibility_checker will remain None, and the API endpoint
    # will need to handle this case.
    print("CredibilityVerificationSystem module not found. Flask app will start, but /api/verify will not work.")
except Exception as e:
    print(f"Error initializing CredibilityVerificationSystem: {e}")
    traceback.print_exc() # Print detailed error traceback
    # Removed sys.exit()
    print("Failed to initialize credibility system. Flask app will start, but /api/verify will not work.")


# --- Flask App Initialization ---
app = Flask(__name__)
# IMPORTANT: Enable CORS to allow requests from your HTML file (frontend)
# For development, allow all origins ('*'). For production, restrict this
# to the actual domain where your frontend is hosted.
CORS(app)
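# Illustrative alternative (not in the original script): restrict CORS to the
# frontend's domain for production instead of allowing all origins, e.g.:
# CORS(app, resources={r"/api/*": {"origins": "https://your-frontend-domain.example"}})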
print("Flask app created and CORS enabled.")

# --- API Route Definition ---
@app.route('/api/verify', methods=['POST'])
def verify_endpoint():
    """
    API endpoint to receive input data and return credibility analysis.
    Expects a JSON payload with the key 'input_data'.
    """
    print("\nReceived request on /api/verify")

    # Check if the credibility system was initialized successfully
    if credibility_checker is None:
        print("Error: Credibility verification system not initialized.")
        return jsonify({"error": "Credibility verification system is not available."}), 503 # Service Unavailable


    # 1. Get data from the request
    if not request.is_json:
        print("Error: Request is not JSON")
        return jsonify({"error": "Request must be JSON"}), 400 # Bad Request

    data = request.get_json()
    input_data = data.get('input_data', None)

    if not input_data or not isinstance(input_data, str) or not input_data.strip():
        print("Error: 'input_data' is missing or invalid")
        return jsonify({"error": "'input_data' field is required and must be a non-empty string"}), 400

    print(f"Processing input: {input_data[:100]}...") # Log received data (truncated)

    # 2. Call the credibility verification system
    try:
        # Use the pre-initialized checker instance
        results = credibility_checker.verify_information(input_data)
        print("Verification successful.")
        # Check if the verification itself returned an error structure
        if isinstance(results, dict) and 'error' in results:
             print(f"Verification returned an error: {results['error']}")
             # Return the specific error from the verification logic
             # Use 400 (Bad Request) or potentially 500 (Internal Server Error)
             # depending on the nature of the error. 400 is often suitable if
             # the input caused the issue (e.g., invalid URL, empty text after processing).
             return jsonify(results), 400

        # 3. Return the results as JSON
        print(f"Returning report with score: {results.get('scoreCredibilite', 'N/A')}")
        return jsonify(results), 200 # OK

    except Exception as e:
        # Catch any unexpected errors during verification
        print(f"Error during verification process: {e}")
        traceback.print_exc() # Log the full error traceback for debugging
        # Return a generic server error message
        return jsonify({"error": "An internal server error occurred during analysis."}), 500 # Internal Server Error


# --- Run the App (for development) ---
if __name__ == '__main__':
    # Runs the Flask development server.
    # host='0.0.0.0' makes it accessible on your network (useful for testing from other devices)
    # debug=True provides automatic reloading and more detailed error pages (DO NOT use in production)
    print("Starting Flask development server...")
    # Flask's development server might restart multiple times when debug=True,
    # leading to re-execution of the code cell. This is normal behavior.
    app.run(host='0.0.0.0', port=5000, debug=True)
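
A minimal client-side sketch for exercising the endpoint above (assumes the server is running locally on port 5000; the requests library and the example input are illustrative, not part of the original file):

import requests

resp = requests.post(
    "http://localhost:5000/api/verify",
    json={"input_data": "https://www.example.com"},
    timeout=120,  # model-backed analysis can take a while
)
print(resp.status_code)
print(resp.json())  # on success, includes fields such as 'scoreCredibilite'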

To cite this code:

Loyer, Dominique. (2024). flaskBackend.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

Gemini_Export.py

Selenium script that exports Gemini conversations to PDF: it loads an existing Chrome profile for authentication, automatically discovers conversation URLs from the chat history (scrolling to load it fully), then opens each conversation and prints it to PDF using Chrome's kiosk-printing mode, saving the files to a local output folder.

Keywords: Selenium, Chrome, Gemini, PDF export, web automation

# -*- coding: utf-8 -*-

# =============================================================================
# EXPORTATEUR DE CLAVARDAGE GEMINI EN PDF (v3.2 - Version Simplifiée)
#
# Description :
# Ce script n'utilise plus la bibliothèque 'selenium-stealth' qui causait
# des problèmes. Il se repose uniquement sur le chargement du profil Chrome
# pour l'authentification.
#
# Auteur : Gemini
# Version : 3.2
# =============================================================================

import os
import sys
import time
import json
import platform

# On s'assure que les bibliothèques sont importées
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException, NoSuchElementException
    from webdriver_manager.chrome import ChromeDriverManager
except ImportError as e:
    print(f"ERREUR CRITIQUE : Une bibliothèque essentielle est manquante : {e.name}")
    print("Veuillez vous assurer d'avoir activé l'environnement virtuel (venv)")
    print("et d'y avoir installé les dépendances avec la commande :")
    print("pip install selenium webdriver-manager")
    sys.exit(1)

# =============================================================================
# SECTION DE CONFIGURATION
# =============================================================================

CHROME_PROFILE_PATH = "/Users/bk280625/Library/Application Support/Google/Chrome"
AUTO_DISCOVER_URLS = True
GEMINI_URLS = []
OUTPUT_DIRECTORY = "Gemini_Exports_PDF"
SCROLL_ATTEMPTS = 30


# =============================================================================
# FONCTIONS DU SCRIPT
# =============================================================================

def setup_driver(profile_path, download_dir):
    """Configure le pilote Chrome avec le gestionnaire auto et le profil."""
    options = webdriver.ChromeOptions()
    options.add_argument(f"user-data-dir={profile_path}")
    
    settings = {
        "recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
        "isHeaderFooterEnabled": False,
        "isCssBackgroundEnabled": True
    }
    prefs = {
        'printing.print_preview_sticky_settings.appState': json.dumps(settings),
        'savefile.default_directory': download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "download.safebrowsing.enabled": True
    }
    options.add_experimental_option('prefs', prefs)
    options.add_argument('--kiosk-printing')
    
    try:
        print("-> Configuration du pilote Chrome...")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        return driver
    except Exception as e:
        print(f"ERREUR : Impossible de démarrer Chrome.")
        print(f"Détail de l'erreur : {e}")
        return None

def discover_gemini_urls(driver, scroll_attempts):
    """Navigue et récupère tous les liens de conversation."""
    print("\n--- Démarrage de la découverte automatique des URLs ---")
    driver.get("https://gemini.google.com/app")
    
    wait = WebDriverWait(driver, 60)
    
    try:
        print("-> Attente du chargement de l'interface principale...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "rich-textarea")))
        print("-> Interface principale détectée.")
        time.sleep(2)

        print("-> Attente de l'apparition de l'historique des conversations...")
        first_link_selector = "a[href*='/app/c/']"
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, first_link_selector)))
        print("-> Historique détecté et visible.")
        
    except TimeoutException:
        print("\nERREUR CRITIQUE : Impossible de trouver l'historique.")
        return []

    print(f"-> Défilement pour charger l'historique complet ({scroll_attempts} tentatives)...")
    try:
        scroll_target = driver.find_element(By.CSS_SELECTOR, "nav")
        for i in range(scroll_attempts):
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_target)
            print(f"  Défilement du panneau {i+1}/{scroll_attempts}...")
            time.sleep(2)
    except NoSuchElementException:
        print("-> Panneau de navigation non trouvé, défilement de la page entière...")
        for i in range(scroll_attempts):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(f"  Défilement de la page {i+1}/{scroll_attempts}...")
            time.sleep(2)

    print("-> Recherche de tous les liens de conversation après défilement...")
    urls = set()
    try:
        links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/app/c/']")
        for link in links:
            href = link.get_attribute('href')
            if href:
                urls.add(href)
    except Exception as e:
        print(f"ERREUR inattendue lors de la collecte des liens : {e}")
        return []

    print(f"-> {len(urls)} conversations uniques trouvées.")
    return list(urls)


def sanitize_filename(title):
    """Nettoie un titre pour en faire un nom de fichier valide."""
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        title = title.replace(char, '_')
    return title[:150].strip()

def export_chat_to_pdf(driver, url, output_dir):
    """Navigue vers une URL Gemini et l'exporte en PDF."""
    try:
        print(f"\nTraitement de l'URL : {url.split('/')[-1]}")
        driver.get(url)
        print("-> Attente du chargement de la conversation...")
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".message-content")))
        print("-> Conversation chargée.")
        
        time.sleep(2)
        page_title = driver.title
        
        if "Gemini" in page_title and len(page_title) < 20:
             chat_id = url.split('/')[-1]
             file_name = f"gemini_chat_{chat_id}.pdf"
        else:
             file_name = sanitize_filename(page_title) + ".pdf"
        
        print(f"-> Lancement de l'impression en PDF...")
        driver.execute_script("window.print();")
        
        time.sleep(5)
        
        print(f"-> Sauvegardé (probablement) sous : {file_name}")
        return True

    except TimeoutException:
        print("ERREUR : Le chargement de la page de conversation a pris trop de temps.")
        return False
    except Exception as e:
        print(f"ERREUR inattendue lors du traitement de {url}: {e}")
        return False

# =============================================================================
# SCRIPT PRINCIPAL
# =============================================================================

if __name__ == "__main__":
    print("========================================")
    print("  Exportateur de Clavardage Gemini PDF  ")
    print("========================================")

    profile_path = CHROME_PROFILE_PATH
    if not profile_path or not os.path.exists(profile_path):
        print("\nERREUR : Chemin du profil Chrome non trouvé ou invalide.")
        sys.exit(1)

    print(f"Utilisation du profil Chrome : {profile_path}")
    print("IMPORTANT : Assurez-vous que Google Chrome est complètement fermé.")
    time.sleep(3)

    if not os.path.exists(OUTPUT_DIRECTORY):
        print(f"\nCréation du dossier de sortie : {OUTPUT_DIRECTORY}")
        os.makedirs(OUTPUT_DIRECTORY)
    
    absolute_output_path = os.path.abspath(OUTPUT_DIRECTORY)
    
    driver = setup_driver(profile_path, absolute_output_path)

    if driver:
        urls_to_process = []
        if AUTO_DISCOVER_URLS:
            urls_to_process = discover_gemini_urls(driver, SCROLL_ATTEMPTS)
        else:
            urls_to_process = GEMINI_URLS

        if not urls_to_process:
            print("\nAucune URL à traiter. Fin du script.")
        else:
            print(f"\n--- Démarrage de l'exportation de {len(urls_to_process)} conversations ---")
            success_count = 0
            fail_count = 0
            for i, url in enumerate(urls_to_process):
                print(f"\n--- Progression : Conversation {i+1}/{len(urls_to_process)} ---")
                if export_chat_to_pdf(driver, url, absolute_output_path):
                    success_count += 1
                else:
                    fail_count += 1
            
            print("\n----------------------------------------")
            print("Exportation terminée !")
            print(f"Conversations sauvegardées : {success_count}")
            print(f"Échecs : {fail_count}")
            print(f"Les fichiers PDF se trouvent dans : {absolute_output_path}")
        
        driver.quit()
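
As a possible variant (not part of the script above): instead of kiosk printing via window.print(), Chromium-based drivers can render the current page to PDF through the DevTools Protocol, which avoids relying on the "Save as PDF" print destination (note: depending on the Chrome version, this command may only be available in headless mode). A minimal sketch, with an illustrative helper name:

import base64

def export_via_cdp(driver, output_path):
    """Render the currently loaded page to PDF via the Chrome DevTools Protocol."""
    result = driver.execute_cdp_cmd("Page.printToPDF", {
        "printBackground": True,       # equivalent of isCssBackgroundEnabled
        "displayHeaderFooter": False   # equivalent of isHeaderFooterEnabled: False
    })
    with open(output_path, "wb") as f:
        f.write(base64.b64decode(result["data"]))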

To cite this code:

Loyer, Dominique. (2024). Gemini_Export.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

interface806.html

Frontend page for the credibility system: a Tailwind-styled form sends the user's URL or text to the Flask backend (POST /api/verify) and renders the returned report, including a credibility gauge, rule-based indicators (source reputation, domain age, linguistic markers, fact-checks) and NLP results (sentiment with LIME-style keyword highlights, bias, coherence, named entities).

Keywords: HTML, Tailwind CSS, JavaScript, fetch API, credibility report

<!DOCTYPE html>
<html lang="fr">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Évaluation de Crédibilité</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdn.jsdelivr.net/npm/lucide-static@latest/dist/lucide.min.js"></script>
    <style>
        /* Style personnalisé pour la jauge */
        .gauge-container {
            width: 200px;
            height: 100px;
            position: relative;
            overflow: hidden;
            border-radius: 100px 100px 0 0;
        }
        .gauge-background {
            width: 100%;
            height: 100%;
            background: linear-gradient(to right, #ef4444, #eab308, #22c55e); /* Red-Yellow-Green */
            position: absolute;
            top: 0;
            left: 0;
        }
        .gauge-mask {
            width: 100%;
            height: 100%;
            background-color: #f3f4f6; /* bg-gray-100 */
            position: absolute;
            top: 0;
            left: 0;
            transform-origin: bottom center;
            /* La rotation sera ajustée par JS */
            transition: transform 0.5s ease-in-out;
        }
        .gauge-center {
            width: 160px;
            height: 80px;
            background-color: #f3f4f6; /* bg-gray-100 */
            position: absolute;
            bottom: 0;
            left: 20px;
            border-radius: 80px 80px 0 0;
            display: flex;
            flex-direction: column;
            justify-content: flex-end;
            align-items: center;
            padding-bottom: 5px;
        }
        .gauge-score {
            font-size: 1.5rem; /* text-2xl */
            font-weight: bold;
        }
        .gauge-label {
            font-size: 0.75rem; /* text-xs */
            color: #6b7280; /* text-gray-500 */
        }
        /* Style pour surligner les mots (explication LIME) */
        .highlight-positive { background-color: rgba(34, 197, 94, 0.3); padding: 0 2px; border-radius: 3px; }
        .highlight-negative { background-color: rgba(239, 68, 68, 0.3); padding: 0 2px; border-radius: 3px; }

        /* Style pour les barres simples */
        .bar-container {
            height: 10px;
            background-color: #e5e7eb; /* bg-gray-200 */
            border-radius: 5px;
            overflow: hidden;
            width: 100px; /* Ajuster si nécessaire */
        }
        .bar {
            height: 100%;
            border-radius: 5px;
            transition: width 0.5s ease-in-out;
        }
    </style>
</head>
<body class="bg-gray-100 font-sans p-4 md:p-8">

    <div class="container mx-auto max-w-3xl bg-white shadow-lg rounded-lg p-6 md:p-8">

        <h1 class="text-2xl md:text-3xl font-bold text-center text-gray-800 mb-6">
            Système d'Évaluation de la Crédibilité de l'Information
        </h1>

        <div class="mb-6">
            <label for="inputData" class="block text-sm font-medium text-gray-700 mb-2">
                Entrez une URL ou collez du texte :
            </label>
            <textarea id="inputData" rows="4" class="w-full p-3 border border-gray-300 rounded-md focus:ring-2 focus:ring-indigo-500 focus:border-indigo-500 transition duration-150 ease-in-out" placeholder="Ex: https://www.example.com ou 'Ce texte semble suspect...'"></textarea>
            <button id="verifyButton" class="mt-3 w-full inline-flex justify-center items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500 transition duration-150 ease-in-out disabled:opacity-50">
                <i data-lucide="search" class="mr-2 h-5 w-5"></i> Vérifier la Crédibilité
            </button>
        </div>

        <div id="reportSection" class="hidden mt-8 border-t border-gray-200 pt-6">
            <h2 class="text-xl md:text-2xl font-semibold text-gray-800 mb-4 text-center">Rapport d'Analyse</h2>

            <div class="flex flex-col items-center mb-6 p-4 bg-gray-50 rounded-lg">
                <h3 class="text-lg font-medium text-gray-700 mb-2">Score de Crédibilité Global</h3>
                <div class="gauge-container mb-2">
                    <div class="gauge-background"></div>
                    <div id="gaugeMask" class="gauge-mask"></div>
                    <div class="gauge-center">
                        <span id="gaugeScore" class="gauge-score">--</span>
                        <span class="gauge-label">0 = Faible, 1 = Élevé</span>
                    </div>
                </div>
                <p id="reportSummary" class="text-center text-gray-600 text-sm italic"></p>
            </div>

            <div class="grid grid-cols-1 md:grid-cols-2 gap-6">

                <div class="bg-gray-50 p-4 rounded-lg shadow-sm">
                    <h4 class="font-semibold text-gray-700 mb-3 border-b pb-2 flex items-center">
                        <i data-lucide="clipboard-list" class="mr-2 h-5 w-5 text-blue-600"></i> Analyse Basée sur les Règles
                    </h4>
                    <div class="space-y-3 text-sm">
                        <div class="flex justify-between items-center">
                            <span class="text-gray-600 flex items-center"><i data-lucide="shield-check" class="mr-1.5 h-4 w-4 text-gray-400"></i>Réputation Source :</span>
                            <span id="repSource" class="font-medium text-gray-800 px-2 py-0.5 rounded">Inconnue</span>
                        </div>
                        <div class="flex justify-between items-center">
                            <span class="text-gray-600 flex items-center"><i data-lucide="calendar-days" class="mr-1.5 h-4 w-4 text-gray-400"></i>Âge Domaine :</span>
                            <span id="ageDomain" class="font-medium text-gray-800">N/A</span>
                        </div>
                        <div class="pt-2">
                            <p class="text-gray-600 mb-1 font-medium">Marqueurs Linguistiques :</p>
                            <div class="flex justify-between items-center pl-4">
                                <span class="text-gray-500 flex items-center"><i data-lucide="megaphone" class="mr-1.5 h-4 w-4 text-yellow-500"></i>Sensationnalisme :</span>
                                <span id="lingSens" class="font-medium text-gray-800">0</span>
                            </div>
                            <div class="flex justify-between items-center pl-4">
                                <span class="text-gray-500 flex items-center"><i data-lucide="check-circle" class="mr-1.5 h-4 w-4 text-green-500"></i>Certitude :</span>
                                <span id="lingCert" class="font-medium text-gray-800">0</span>
                            </div>
                            <div class="flex justify-between items-center pl-4">
                                <span class="text-gray-500 flex items-center"><i data-lucide="alert-circle" class="mr-1.5 h-4 w-4 text-red-500"></i>Doute :</span>
                                <span id="lingDoubt" class="font-medium text-gray-800">0</span>
                            </div>
                        </div>
                         <div class="pt-2">
                            <p class="text-gray-600 mb-1 font-medium flex items-center">
                               <i data-lucide="search-check" class="mr-1.5 h-4 w-4 text-blue-500"></i> Vérifications Externes (Fact-Checks) :
                            </p>
                            <ul id="factChecksList" class="list-disc list-inside pl-4 text-gray-500">
                                <li>Aucune trouvée.</li>
                            </ul>
                        </div>
                    </div>
                </div>

                <div class="bg-gray-50 p-4 rounded-lg shadow-sm">
                    <h4 class="font-semibold text-gray-700 mb-3 border-b pb-2 flex items-center">
                        <i data-lucide="brain-circuit" class="mr-2 h-5 w-5 text-purple-600"></i> Analyse par IA (NLP)
                    </h4>
                    <div class="space-y-3 text-sm">
                        <div class="flex justify-between items-center">
                            <span class="text-gray-600 flex items-center"><i data-lucide="smile" class="mr-1.5 h-4 w-4 text-gray-400"></i>Sentiment :</span>
                            <div>
                                <span id="sentimentLabel" class="font-medium text-gray-800 px-2 py-0.5 rounded">Neutre</span>
                                (<span id="sentimentScore" class="text-gray-500">--</span>)
                            </div>
                        </div>
                         <div class="pt-1">
                             <p class="text-gray-600 mb-1 font-medium flex items-center">
                                <i data-lucide="highlighter" class="mr-1.5 h-4 w-4 text-gray-400"></i> Mots Clés (Sentiment) :
                            </p>
                            <p id="sentimentExplanation" class="text-gray-500 pl-4 text-xs italic">
                               (Explication non disponible)
                            </p>
                        </div>
                        <div class="flex justify-between items-center">
                            <span class="text-gray-600 flex items-center"><i data-lucide="swords" class="mr-1.5 h-4 w-4 text-gray-400"></i>Biais Potentiel :</span>
                             <div>
                                <span id="biasLabel" class="font-medium text-gray-800 px-2 py-0.5 rounded">Non Détecté</span>
                                (<span id="biasScore" class="text-gray-500">--</span>)
                             </div>
                        </div>
                        <div class="flex justify-between items-center">
                            <span class="text-gray-600 flex items-center"><i data-lucide="puzzle" class="mr-1.5 h-4 w-4 text-gray-400"></i>Cohérence (Sim.) :</span>
                            <div class="bar-container">
                                <div id="coherenceBar" class="bar bg-blue-500"></div>
                            </div>
                         </div>
                        <div class="flex justify-between items-center">
                            <span class="text-gray-600 flex items-center"><i data-lucide="tags" class="mr-1.5 h-4 w-4 text-gray-400"></i>Entités Reconnues :</span>
                            <span id="nerCount" class="font-medium text-gray-800">0</span>
                        </div>
                    </div>
                </div>
            </div>

             <div class="mt-6 bg-gray-50 p-4 rounded-lg shadow-sm">
                 <h4 class="font-semibold text-gray-700 mb-2 text-sm flex items-center">
                    <i data-lucide="file-text" class="mr-2 h-4 w-4 text-gray-500"></i> Texte Analysé (Extrait)
                 </h4>
                 <p id="originalTextPreview" class="text-xs text-gray-500 italic max-h-20 overflow-y-auto bg-white p-2 rounded border border-gray-200"></p>
            </div>

        </div> 
        
        <div id="loadingIndicator" class="hidden text-center mt-6">
             <div class="inline-flex items-center px-4 py-2 font-semibold leading-6 text-sm shadow rounded-md text-indigo-700 bg-white transition ease-in-out duration-150 cursor-not-allowed">
                <svg class="animate-spin -ml-1 mr-3 h-5 w-5 text-indigo-500" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
                    <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
                    <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
                </svg>
                Analyse en cours...
            </div>
        </div>

         <div id="errorSection" class="hidden mt-6 p-4 bg-red-100 border border-red-400 text-red-700 rounded-lg">
             <h4 class="font-bold flex items-center"><i data-lucide="alert-triangle" class="mr-2 h-5 w-5"></i> Erreur</h4>
             <p id="errorMessage" class="text-sm"></p>
         </div>

    </div>

    <script>
        // FIX: Wrap all JavaScript code in DOMContentLoaded event listener
        document.addEventListener('DOMContentLoaded', () => {
            // --- Éléments DOM ---
            const inputDataEl = document.getElementById('inputData');
            const verifyButton = document.getElementById('verifyButton');
            const reportSection = document.getElementById('reportSection');
            const errorSection = document.getElementById('errorSection');
            const errorMessageEl = document.getElementById('errorMessage');
            const loadingIndicator = document.getElementById('loadingIndicator');

            // Éléments du rapport
            const gaugeScoreEl = document.getElementById('gaugeScore');
            const gaugeMaskEl = document.getElementById('gaugeMask');
            const reportSummaryEl = document.getElementById('reportSummary');
            const repSourceEl = document.getElementById('repSource');
            const ageDomainEl = document.getElementById('ageDomain');
            const lingSensEl = document.getElementById('lingSens');
            const lingCertEl = document.getElementById('lingCert');
            const lingDoubtEl = document.getElementById('lingDoubt');
            const factChecksListEl = document.getElementById('factChecksList');
            const sentimentLabelEl = document.getElementById('sentimentLabel');
            const sentimentScoreEl = document.getElementById('sentimentScore');
            const sentimentExplanationEl = document.getElementById('sentimentExplanation');
            const biasLabelEl = document.getElementById('biasLabel');
            const biasScoreEl = document.getElementById('biasScore');
            const coherenceBarEl = document.getElementById('coherenceBar');
            const nerCountEl = document.getElementById('nerCount');
            const originalTextPreviewEl = document.getElementById('originalTextPreview');

            // Initialiser Lucide Icons
            lucide.createIcons();

            // --- Logique ---
            verifyButton.addEventListener('click', handleVerificationRequest);

            async function handleVerificationRequest() {
                const inputText = inputDataEl.value.trim();
                if (!inputText) {
                    showError("Veuillez entrer une URL ou du texte.");
                    return;
                }

                loadingIndicator.classList.remove('hidden');
                reportSection.classList.add('hidden');
                errorSection.classList.add('hidden');
                verifyButton.disabled = true;

                try {
                    // *** APPEL RÉEL AU BACKEND ***
                    const response = await fetch('http://localhost:5000/api/verify', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ input_data: inputText })
                    });
                    
                    const reportData = await response.json(); // Toujours essayer de lire le JSON

                    if (!response.ok) {
                        // Si la réponse n'est pas OK, lancer une erreur avec le message du serveur
                        throw new Error(reportData.error || `Erreur serveur: ${response.status}`);
                    }

                    displayReport(reportData);
                    reportSection.classList.remove('hidden');

                } catch (error) {
                    console.error("Erreur lors de la vérification:", error);
                    showError(error.message || "Une erreur de communication avec le serveur est survenue.");
                } finally {
                    loadingIndicator.classList.add('hidden');
                    verifyButton.disabled = false;
                }
            }

            function displayReport(report) {
                const score = report.scoreCredibilite;
                gaugeScoreEl.textContent = score.toFixed(2);
                const rotation = Math.max(0, Math.min(180, (1 - score) * 180));
                gaugeMaskEl.style.transform = `rotate(${rotation}deg)`;
                reportSummaryEl.textContent = report.resumeAnalyse || "Résumé non disponible.";

                const rules = report.reglesAppliquees;
                const sourceAnalysis = rules.source_analysis || {};
                const linguisticMarkers = rules.linguistic_markers || {};

                const reputation = sourceAnalysis.reputation || 'Inconnue';
                repSourceEl.textContent = reputation;
                repSourceEl.className = `font-medium px-2 py-0.5 rounded ${getReputationColor(reputation)}`;

                const age = sourceAnalysis.domain_age_days;
                ageDomainEl.textContent = age !== null ? `${age} jours` : 'N/A';

                lingSensEl.textContent = linguisticMarkers.sensationalism || 0;
                lingCertEl.textContent = linguisticMarkers.certainty || 0;
                lingDoubtEl.textContent = linguisticMarkers.doubt || 0;

                const factChecks = rules.fact_checking || [];
                factChecksListEl.innerHTML = '';
                if (factChecks.length > 0) {
                    factChecks.forEach(fc => {
                        const li = document.createElement('li');
                        const ratingIcon = fc.rating === 'True' ? '<i data-lucide="check-circle-2" class="inline-block h-3 w-3 text-green-500 mr-1"></i>' :
                                           fc.rating === 'False' ? '<i data-lucide="x-circle" class="inline-block h-3 w-3 text-red-500 mr-1"></i>' :
                                           '<i data-lucide="help-circle" class="inline-block h-3 w-3 text-gray-400 mr-1"></i>';
                        li.innerHTML = `${ratingIcon} ${fc.claim || 'Affirmation inconnue'} (${fc.rating || 'Note inconnue'})`;
                        factChecksListEl.appendChild(li);
                    });
                     lucide.createIcons();
                } else {
                    factChecksListEl.innerHTML = '<li>Aucun trouvé.</li>';
                }

                const nlp = report.analyseNLP || {};

                const sentiment = nlp.sentiment || { label: 'Neutral', score: 0.5 };
                sentimentLabelEl.textContent = sentiment.label || 'Inconnu';
                sentimentScoreEl.textContent = sentiment.score !== undefined ? sentiment.score.toFixed(2) : '--';
                sentimentLabelEl.className = `font-medium px-2 py-0.5 rounded ${getSentimentColor(sentiment.label)}`;

                const explanation = nlp.sentiment_explanation_preview || [];
                if (explanation.length > 0) {
                    sentimentExplanationEl.innerHTML = explanation.map(item =>
                        `<span class="${item[1] > 0 ? 'highlight-positive' : 'highlight-negative'}">${item[0]} (${item[1].toFixed(2)})</span>`
                    ).join(', ');
                } else {
                    sentimentExplanationEl.textContent = "(Explication non disponible)";
                }

                const bias = nlp.bias_analysis || { label: 'Non Détecté', score: null };
                biasLabelEl.textContent = bias.label || 'Inconnu';
                biasScoreEl.textContent = bias.score !== null ? bias.score.toFixed(2) : '--';
                biasLabelEl.className = `font-medium px-2 py-0.5 rounded ${getBiasColor(bias.label)}`;

                const coherence = nlp.coherence_score;
                if (coherence !== null && coherence !== undefined) {
                    coherenceBarEl.style.width = `${Math.max(0, Math.min(100, coherence * 100))}%`;
                } else {
                     coherenceBarEl.style.width = '0%';
                }

                nerCountEl.textContent = nlp.named_entities_count !== undefined ? nlp.named_entities_count : 0;
                
                const originalText = report.reglesAppliquees?.original_text_preview || report.informationEntree;
                originalTextPreviewEl.textContent = originalText.substring(0, 300) + (originalText.length > 300 ? '...' : '');
            }

            function getReputationColor(reputation) {
                switch (reputation?.toLowerCase()) {
                    case 'high': return 'bg-green-100 text-green-800';
                    case 'medium': return 'bg-yellow-100 text-yellow-800';
                    case 'low': return 'bg-red-100 text-red-800';
                    default: return 'bg-gray-100 text-gray-800';
                }
            }

            function getSentimentColor(label) {
                 switch (label?.toUpperCase()) {
                    case 'POSITIVE': return 'bg-green-100 text-green-800';
                    case 'NEGATIVE': return 'bg-red-100 text-red-800';
                    default: return 'bg-gray-100 text-gray-800';
                }
            }

            function getBiasColor(label) {
                 if (label?.toLowerCase().includes('flagged') || label?.toLowerCase().includes('potential')) {
                     return 'bg-yellow-100 text-yellow-800';
                 } else if (label?.toLowerCase().includes('low') || label?.toLowerCase().includes('non détecté')) {
                     return 'bg-green-100 text-green-800';
                 } else {
                      return 'bg-gray-100 text-gray-800';
                 }
            }

            function showError(message) {
                errorMessageEl.textContent = message;
                errorSection.classList.remove('hidden');
                reportSection.classList.add('hidden');
            }
        });
    </script>
</body>
</html>
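
For reference, a minimal report payload that this page can render (field names taken from the displayReport function above; the values are purely illustrative):

report = {
    "scoreCredibilite": 0.72,
    "resumeAnalyse": "Source generally reliable, few sensationalist markers.",
    "informationEntree": "https://www.example.com",
    "reglesAppliquees": {
        "source_analysis": {"reputation": "high", "domain_age_days": 5400},
        "linguistic_markers": {"sensationalism": 1, "certainty": 3, "doubt": 0},
        "fact_checking": [{"claim": "Example claim", "rating": "True"}],
        "original_text_preview": "Example article text...",
    },
    "analyseNLP": {
        "sentiment": {"label": "POSITIVE", "score": 0.91},
        "sentiment_explanation_preview": [["reliable", 0.42], ["suspect", -0.17]],
        "bias_analysis": {"label": "Low", "score": 0.12},
        "coherence_score": 0.8,
        "named_entities_count": 7,
    },
}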

To cite this code:

Loyer, Dominique. (2024). interface806.html [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

lect_1907.py

Terminal document reader: it extracts text from TXT/TEX, PDF, DOCX and HTML files or from a shared Google Doc, builds an interactive keyword workshop backed by a persistent keyword memory (JSON), highlights the validated keywords in the terminal with ANSI colours, and cleans the text before it is read aloud with gTTS/pydub.

Keywords: text-to-speech, gTTS, keyword extraction, PDF, terminal

# -*- coding: utf-8 -*-

# Ce script nécessite l'installation de bibliothèques tierces.
# pip3 install PyPDF2 python-docx gTTS pydub langdetect beautifulsoup4 requests stop-words
#
# IMPORTANT: pydub nécessite FFMPEG. Sur macOS, installez-le avec Homebrew:
# brew install ffmpeg

import os
import re
import collections
from itertools import cycle
import json
import subprocess
import math
import time
import threading

# --- Importations des bibliothèques avec vérification ---
try:
    from pynput import keyboard
except ImportError: print("ERREUR: 'pynput' est requis. `pip3 install pynput`"); exit()
try:
    from gtts import gTTS
except ImportError: print("ERREUR: 'gTTS' est requis. `pip3 install gTTS`"); exit()
try:
    from pydub import AudioSegment
except ImportError: print("ERREUR: 'pydub' est requis. `pip3 install pydub` et `brew install ffmpeg`"); exit()
try:
    from langdetect import detect, LangDetectException
except ImportError: print("ERREUR: 'langdetect' est requis. `pip3 install langdetect`"); exit()
try:
    import requests
except ImportError: print("ERREUR: 'requests' est requis. `pip3 install requests`"); exit()
try:
    from stop_words import get_stop_words
except ImportError: print("ERREUR: 'stop-words' est requis. `pip3 install stop-words`"); exit()
try:
    import PyPDF2
except ImportError: print("AVERTISSEMENT: 'PyPDF2' n'est pas installé."); PyPDF2 = None
try:
    import docx
except ImportError: print("AVERTISSEMENT: 'python-docx' n'est pas installé."); docx = None
try:
    from bs4 import BeautifulSoup
except ImportError: print("AVERTISSEMENT: 'beautifulsoup4' n'est pas installé."); BeautifulSoup = None


# --- Définition des couleurs ---
class Couleurs:
    RESET = '\033[0m'
    BOLD = '\033[1m'
    FG_NOIR = '\033[30m'
    BG_JAUNE = '\033[103m'
    BG_CYAN = '\033[106m'
    BG_MAGENTA = '\033[105m'
    BG_VERT = '\033[102m'
    CITATION_VERT = '\033[92m'

# --- État global de la lecture ---
etat_lecture = {
    "pause": threading.Event(), "quitter": threading.Event(),
    "prochain": threading.Event(), "precedent": threading.Event(),
    "processus_audio": None
}
etat_lecture["pause"].set()

# --- Fonctions de lecture de fichiers ---
def lire_document(chemin_fichier):
    nom_fichier, extension = os.path.splitext(chemin_fichier)
    extension = extension.lower()
    print(f"\n--- Début de l'extraction du texte : {os.path.basename(chemin_fichier)} ---")
    if extension in ['.txt', '.tex']: return lire_fichier_txt(chemin_fichier)
    elif extension == '.pdf': return lire_fichier_pdf(chemin_fichier)
    elif extension == '.docx': return lire_fichier_docx(chemin_fichier)
    elif extension == '.html': return lire_fichier_html(chemin_fichier)
    else: print(f"Erreur : Type de fichier '{extension}' non supporté."); return None

def lire_fichier_txt(chemin_fichier):
    try:
        with open(chemin_fichier, 'r', encoding='utf-8') as f: return f.read()
    except Exception as e: print(f"Erreur lecture TXT: {e}"); return None
def lire_fichier_pdf(chemin_fichier):
    if not PyPDF2: print("ERREUR: PyPDF2 n'est pas disponible."); return None
    texte_complet = ""
    try:
        with open(chemin_fichier, 'rb') as f:
            lecteur_pdf = PyPDF2.PdfReader(f)
            for page in lecteur_pdf.pages: texte_complet += page.extract_text() or ""
        return texte_complet
    except Exception as e: print(f"Erreur lecture PDF: {e}"); return None
def lire_fichier_docx(chemin_fichier):
    if not docx: print("ERREUR: python-docx n'est pas disponible."); return None
    texte_complet = []
    try:
        document = docx.Document(chemin_fichier)
        for p in document.paragraphs: texte_complet.append(p.text)
        return '\n'.join(texte_complet)
    except Exception as e: print(f"Erreur lecture DOCX: {e}"); return None
def lire_fichier_html(chemin_fichier):
    if not BeautifulSoup: print("ERREUR: beautifulsoup4 n'est pas disponible."); return None
    try:
        with open(chemin_fichier, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
            for script_or_style in soup(["script", "style"]): script_or_style.decompose()
            texte = soup.get_text()
            lignes = (line.strip() for line in texte.splitlines())
            morceaux = (phrase.strip() for line in lignes for phrase in line.split("  "))
            return '\n'.join(chunk for chunk in morceaux if chunk)
    except Exception as e: print(f"Erreur lecture HTML: {e}"); return None
def lire_google_doc(url):
    print("--- Tentative de lecture du Google Doc... ---")
    try:
        match = re.search(r'/document/d/([a-zA-Z0-9-_]+)', url)
        if not match: print("ERREUR: URL de Google Doc invalide."); return None
        doc_id = match.group(1)
        export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
        print(f"--- Téléchargement du contenu depuis Google Docs... ---")
        response = requests.get(export_url, timeout=15)
        response.raise_for_status()
        return response.content.decode('utf-8-sig')
    except requests.exceptions.RequestException as e:
        print(f"ERREUR: Impossible de télécharger le Google Doc. Vérifiez le lien et le partage.\n{e}"); return None

def selectionner_source_terminal():
    print("\n" + "="*60)
    print("▶️  Glissez-déposez un fichier OU collez un lien Google Doc,")
    print("   puis appuyez sur la touche [Entrée].")
    print("="*60)
    chemin_brut = input()
    return chemin_brut.strip().strip("'").strip('"')

# --- Fonctions de mémoire et d'extraction des mots-clés ---
def preparer_dossier_export():
    dossier_export = os.path.join(os.path.expanduser("~"), "Desktop", "Lecteur_Exports")
    os.makedirs(dossier_export, exist_ok=True)
    return dossier_export

def charger_memoire(dossier_export, nom_fichier):
    chemin_memoire = os.path.join(dossier_export, nom_fichier)
    if os.path.exists(chemin_memoire):
        with open(chemin_memoire, 'r', encoding='utf-8') as f: return json.load(f)
    return {}

def sauvegarder_memoire(dossier_export, memoire, nom_fichier):
    chemin_memoire = os.path.join(dossier_export, nom_fichier)
    with open(chemin_memoire, 'w', encoding='utf-8') as f: json.dump(memoire, f, indent=4)
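
# Illustrative (hypothetical) content of memoire_mots_cles.json after a few runs:
# {"cybersécurité": 112, "apprentissage": 37, "réseau": 21}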

def extraire_mots_cles(texte, dossier_export, nombre_mots=100):
    memoire = charger_memoire(dossier_export, "memoire_mots_cles.json")
    print("🧠 Mémoire des mots-clés chargée.")
    stopwords = set(get_stop_words('fr') + get_stop_words('en'))
    noms_a_ignorer = set(['dominique loyer', 'dominique s. loyer'])
    noms_propres = re.findall(r'(?<![.\s])\b[A-Z][a-z]{3,}\b', texte)
    mots = re.findall(r'\b\w+\b', texte.lower())
    mots_filtres = [mot for mot in mots if mot not in stopwords and len(mot) > 3]
    compteur_actuel = collections.Counter(mots_filtres)
    for mot, freq in compteur_actuel.items():
        if mot not in noms_a_ignorer:
            memoire[mot] = memoire.get(mot, 0) + freq
    mots_cles_memoire = sorted(memoire, key=memoire.get, reverse=True)
    mots_cles_combines = list(dict.fromkeys(noms_propres + mots_cles_memoire))
    mots_cles_finaux = [mot for mot in mots_cles_combines if mot.lower() not in stopwords and mot.lower() not in noms_a_ignorer][:nombre_mots]
    return mots_cles_finaux, memoire

def reviser_mots_cles_interactivement(mots_cles_proposes, memoire):
    mots_cles_approuves = list(mots_cles_proposes)
    while True:
        print("\n" + "="*80 + "\n✍️  ATELIER DE MOTS-CLÉS INTERACTIF\n" + "="*80)
        num_mots = len(mots_cles_approuves)
        num_cols = 4
        num_rows = math.ceil(num_mots / num_cols)
        col_width = 20
        for i in range(num_rows):
            line = ""
            for j in range(num_cols):
                index = i + j * num_rows
                if index < num_mots:
                    mot = f"{index+1:2d}. {mots_cles_approuves[index]}"
                    line += mot.ljust(col_width)
            print(line)
        print("\nCommandes:\n  - Numéro pour DÉSAPPROUVER (ex: 5)\n  - '+' suivi d'un mot pour AJOUTER (ex: +cybersécurité)\n  - 'ok' ou [Entrée] pour VALIDER.")
        choix = input("\nVotre choix : ").strip()
        if not choix or choix.lower() == 'ok': break
        elif choix.startswith('+'):
            nouveau_mot = choix[1:].strip().lower()
            if nouveau_mot and nouveau_mot not in mots_cles_approuves:
                mots_cles_approuves.append(nouveau_mot)
                memoire[nouveau_mot] = memoire.get(nouveau_mot, 0) + 100
                print(f"✅ '{nouveau_mot}' ajouté et promu.")
        elif choix.isdigit():
            try:
                index = int(choix) - 1
                if 0 <= index < len(mots_cles_approuves):
                    mot_retire = mots_cles_approuves.pop(index)
                    memoire[mot_retire] = max(0, memoire.get(mot_retire, 0) - 50)
                    print(f"❌ '{mot_retire}' retiré et rétrogradé.")
                else: print("Numéro invalide.")
            except ValueError: print("Entrée non reconnue.")
        else: print("Commande invalide.")
    print(f"\n--- {len(mots_cles_approuves)} Mots-clés finaux validés. ---")
    return mots_cles_approuves, memoire

def surligner_texte(texte, mots_cles):
    texte_colore = texte
    paires_couleurs = [(Couleurs.FG_NOIR, Couleurs.BG_JAUNE), (Couleurs.FG_NOIR, Couleurs.BG_CYAN), (Couleurs.FG_NOIR, Couleurs.BG_MAGENTA), (Couleurs.FG_NOIR, Couleurs.BG_VERT)]
    cycle_couleurs = cycle(paires_couleurs)
    mots_tries = sorted(mots_cles, key=len, reverse=True)
    for mot in mots_tries:
        couleur_texte, couleur_fond = next(cycle_couleurs)
        pattern = r'\b(' + re.escape(mot) + r')\b'
        texte_colore = re.sub(pattern, f"{Couleurs.BOLD}{couleur_fond}{couleur_texte}\\1{Couleurs.RESET}", texte_colore, flags=re.IGNORECASE)
    return texte_colore

def nettoyer_texte_pour_lecture(texte):
    texte_nettoye = texte.replace('-', ' ')
    texte_nettoye = re.sub(r'\(.*?\)', '', texte_nettoye)
    texte_nettoye = re.sub(r'\{.*?\}', '', texte_nettoye)
    texte_nettoye = re.sub(r'\s*\([^)]+\d{4}[^)]*\)', '', texte_nettoye)
    texte_nettoye = re.sub(r'Dominique\s+S?\.\s+Loyer', '', texte_nettoye, flags=re.IGNORECASE)
    return texte_nettoye.strip()

# --- Contrôleur de clavier et lecture audio fiables ---
def arreter_son_en_cours():
    if etat_lecture["processus_audio"] and etat_lecture["processus_audio"].poll() is None:
        etat_lecture["processus_audio"].terminate()
        etat_lecture["processus_audio"].wait()

def controleur_clavier():
    def on_press(key):
        global etat_lecture
        if etat_lecture["quitter"].is_set(): return False
        
        action_detectee = False
        if hasattr(key, 'char'):
            if key.char in ['p', 'q']: action_detectee = True
        elif key in [keyboard.Key.right, keyboard.Key.left]: action_detectee = True
        
        if action_detectee: arreter_son_en_cours()

        if hasattr(key, 'char'):
            if key.char == 'p':
                # The "pause" event is really a "playing" flag: cleared = paused, set = playing.
                if etat_lecture["pause"].is_set():
                    print(f"\n--- EN PAUSE ---", end="", flush=True)
                    etat_lecture["pause"].clear()
                else:
                    print(f"\n--- REPRISE ---", end="", flush=True)
                    etat_lecture["pause"].set()
            elif key.char == 'q':
                etat_lecture["quitter"].set(); etat_lecture["pause"].set()
                return False
        elif key == keyboard.Key.right:
            etat_lecture["prochain"].set()
        elif key == keyboard.Key.left:
            etat_lecture["precedent"].set()
            
    listener = keyboard.Listener(on_press=on_press)
    listener.start()
    return listener

# --- FONCTION DE LECTURE PRINCIPALE (v32 - Interactive Stable) ---
def parler(texte, mots_cles, start_index=0):
    global etat_lecture
    paragraphes = re.split(r'\n\s*\n', texte)
    
    listener = controleur_clavier()
    print("\n--- Début de la lecture ---")
    print("Contrôles: [p] Pause/Reprise | [→] Paragraphe suivant | [←] Paragraphe précédent | [q] Quitter")
    
    index_para = start_index
    while index_para < len(paragraphes):
        if etat_lecture["quitter"].is_set(): break
        
        etat_lecture["pause"].wait()

        if etat_lecture["prochain"].is_set():
            index_para = min(index_para + 1, len(paragraphes) - 1)
            etat_lecture["prochain"].clear()
        if etat_lecture["precedent"].is_set():
            index_para = max(index_para - 1, 0)
            etat_lecture["precedent"].clear()

        paragraphe_original = paragraphes[index_para].strip()
        if not paragraphe_original: index_para += 1; continue
        
        fichier_audio_temp = f"temp_audio_{index_para}.mp3"
        fichier_wav_temp = f"temp_audio_{index_para}.wav"
        
        try:
            paragraphe_a_lire = nettoyer_texte_pour_lecture(paragraphe_original)
            texte_terminal = surligner_texte(paragraphe_original, mots_cles)
            
            print(f"\n▶️  {texte_terminal}")

            if not paragraphe_a_lire: index_para += 1; continue

            try: lang = detect(paragraphe_a_lire)
            except LangDetectException: lang = 'fr'
            tts_lang = 'en' if 'en' in lang else 'fr'
            
            tts = gTTS(text=paragraphe_a_lire, lang=tts_lang, slow=False)
            tts.save(fichier_audio_temp)
            
            if os.path.exists(fichier_audio_temp) and os.path.getsize(fichier_audio_temp) > 0:
                sound = AudioSegment.from_mp3(fichier_audio_temp)
                faster_sound = sound.speedup(playback_speed=1.50)
                faster_sound.export(fichier_wav_temp, format="wav")
                
                with open(os.devnull, 'w') as FNULL:
                    etat_lecture["processus_audio"] = subprocess.Popen(["afplay", fichier_wav_temp], stdout=FNULL, stderr=FNULL)
                
                etat_lecture["processus_audio"].wait()
            
            if not any([etat_lecture["prochain"].is_set(), etat_lecture["precedent"].is_set()]):
                 index_para += 1
        except Exception as e:
            print(f"Erreur durant la lecture : {e}")
            index_para += 1
        finally:
            if os.path.exists(fichier_audio_temp): os.remove(fichier_audio_temp)
            if os.path.exists(fichier_wav_temp): os.remove(fichier_wav_temp)
            
    etat_lecture["quitter"].set()
    arreter_son_en_cours()
    listener.stop()
    print("\n\n--- Fin de la session de lecture. ---")
    
    if index_para >= len(paragraphes):
        return 0
    return index_para

# --- POINT D'ENTRÉE PRINCIPAL DU SCRIPT ---
if __name__ == "__main__":
    print("--- Lancement du lecteur de documents intelligent ---")
    
    source_key = selectionner_source_terminal()
    contenu = None
    
    if source_key.startswith('http'):
        contenu = lire_google_doc(source_key)
    elif os.path.exists(source_key):
        contenu = lire_document(source_key)
    else:
        print("Source invalide ou fichier non trouvé. Fin du programme.")
    
    if contenu:
        dossier_exportation = preparer_dossier_export()
        progression = charger_memoire(dossier_exportation, "progression_lecture.json")
        start_index = progression.get(source_key, 0)
        
        if start_index > 0:
            print(f"\nUne progression a été trouvée pour ce document (paragraphe {start_index + 1}).")
            choix_reprise = input("Voulez-vous reprendre la lecture ? (o/n) : ").lower()
            if choix_reprise != 'o':
                start_index = 0
        
        mots_cles_proposes, memoire_initiale = extraire_mots_cles(contenu, dossier_exportation)
        mots_cles_finaux, memoire_mise_a_jour = reviser_mots_cles_interactivement(mots_cles_proposes, memoire_initiale)
        sauvegarder_memoire(dossier_exportation, memoire_mise_a_jour, "memoire_mots_cles.json")
        
        index_final = parler(contenu, mots_cles_finaux, start_index=start_index)
        
        progression[source_key] = index_final
        sauvegarder_memoire(dossier_exportation, progression, "progression_lecture.json")
        print(f"Progression sauvegardée au paragraphe {index_final + 1}.")
        
        print("\n--- Processus terminé. ---")

To cite this code:

Loyer, Dominique. (2024). lect_1907.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

lect20juil.py

This Python script is an interactive terminal document reader: it extracts text from TXT, PDF, DOCX and HTML files or from a shared Google Doc link, maintains a persistent keyword memory with an interactive review step, highlights the keywords in colour in the terminal, and reads each paragraph aloud (gTTS synthesis, pydub speed-up, playback via afplay on macOS), with keyboard controls and saved reading progress.

Keywords: text-to-speech, keyword extraction, document reader, gTTS, pydub

# -*- coding: utf-8 -*-

# Ce script nécessite l'installation de bibliothèques tierces.
# pip3 install PyPDF2 python-docx gTTS pydub langdetect beautifulsoup4 requests stop-words pynput
#
# IMPORTANT: pydub nécessite FFMPEG. Sur macOS, installez-le avec Homebrew:
# brew install ffmpeg

import os
import re
import collections
from itertools import cycle
import json
import subprocess
import math
import time
import threading

# --- Importations des bibliothèques avec vérification ---
try:
    from pynput import keyboard
except ImportError: print("ERREUR: 'pynput' est requis. `pip3 install pynput`"); exit()
try:
    from gtts import gTTS
except ImportError: print("ERREUR: 'gTTS' est requis. `pip3 install gTTS`"); exit()
try:
    from pydub import AudioSegment
except ImportError: print("ERREUR: 'pydub' est requis. `pip3 install pydub` et `brew install ffmpeg`"); exit()
try:
    from langdetect import detect, LangDetectException
except ImportError: print("ERREUR: 'langdetect' est requis. `pip3 install langdetect`"); exit()
try:
    import requests
except ImportError: print("ERREUR: 'requests' est requis. `pip3 install requests`"); exit()
try:
    from stop_words import get_stop_words
except ImportError: print("ERREUR: 'stop-words' est requis. `pip3 install stop-words`"); exit()
try:
    import PyPDF2
except ImportError: print("AVERTISSEMENT: 'PyPDF2' n'est pas installé."); PyPDF2 = None
try:
    import docx
except ImportError: print("AVERTISSEMENT: 'python-docx' n'est pas installé."); docx = None
try:
    from bs4 import BeautifulSoup
except ImportError: print("AVERTISSEMENT: 'beautifulsoup4' n'est pas installé."); BeautifulSoup = None


# --- Définition des couleurs ---
class Couleurs:
    RESET = '\033[0m'
    BOLD = '\033[1m'
    FG_NOIR = '\033[30m'
    BG_JAUNE = '\033[103m'
    BG_CYAN = '\033[106m'
    BG_MAGENTA = '\033[105m'
    BG_VERT = '\033[102m'
    CITATION_VERT = '\033[92m'

# --- État global de la lecture ---
etat_lecture = {
    "pause": threading.Event(), "quitter": threading.Event(),
    "prochain": threading.Event(), "precedent": threading.Event(),
    "processus_audio": None
}
etat_lecture["pause"].set()

# --- Fonctions de lecture de fichiers ---
def lire_document(chemin_fichier):
    nom_fichier, extension = os.path.splitext(chemin_fichier)
    extension = extension.lower()
    print(f"\n--- Début de l'extraction du texte : {os.path.basename(chemin_fichier)} ---")
    if extension in ['.txt', '.tex']: return lire_fichier_txt(chemin_fichier)
    elif extension == '.pdf': return lire_fichier_pdf(chemin_fichier)
    elif extension == '.docx': return lire_fichier_docx(chemin_fichier)
    elif extension == '.html': return lire_fichier_html(chemin_fichier)
    else: print(f"Erreur : Type de fichier '{extension}' non supporté."); return None

def lire_fichier_txt(chemin_fichier):
    try:
        with open(chemin_fichier, 'r', encoding='utf-8') as f: return f.read()
    except Exception as e: print(f"Erreur lecture TXT: {e}"); return None
def lire_fichier_pdf(chemin_fichier):
    if not PyPDF2: print("ERREUR: PyPDF2 n'est pas disponible."); return None
    texte_complet = ""
    try:
        with open(chemin_fichier, 'rb') as f:
            lecteur_pdf = PyPDF2.PdfReader(f)
            for page in lecteur_pdf.pages: texte_complet += page.extract_text() or ""
        return texte_complet
    except Exception as e: print(f"Erreur lecture PDF: {e}"); return None
def lire_fichier_docx(chemin_fichier):
    if not docx: print("ERREUR: python-docx n'est pas disponible."); return None
    texte_complet = []
    try:
        document = docx.Document(chemin_fichier)
        for p in document.paragraphs: texte_complet.append(p.text)
        return '\n'.join(texte_complet)
    except Exception as e: print(f"Erreur lecture DOCX: {e}"); return None
def lire_fichier_html(chemin_fichier):
    if not BeautifulSoup: print("ERREUR: beautifulsoup4 n'est pas disponible."); return None
    try:
        with open(chemin_fichier, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
            for script_or_style in soup(["script", "style"]): script_or_style.decompose()
            texte = soup.get_text()
            lignes = (line.strip() for line in texte.splitlines())
            morceaux = (phrase.strip() for line in lignes for phrase in line.split("  "))
            return '\n'.join(chunk for chunk in morceaux if chunk)
    except Exception as e: print(f"Erreur lecture HTML: {e}"); return None
def lire_google_doc(url):
    print("--- Tentative de lecture du Google Doc... ---")
    try:
        match = re.search(r'/document/d/([a-zA-Z0-9-_]+)', url)
        if not match: print("ERREUR: URL de Google Doc invalide."); return None
        doc_id = match.group(1)
        export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
        print(f"--- Téléchargement du contenu depuis Google Docs... ---")
        response = requests.get(export_url, timeout=15)
        response.raise_for_status()
        return response.content.decode('utf-8-sig')
    except requests.exceptions.RequestException as e:
        print(f"ERREUR: Impossible de télécharger le Google Doc. Vérifiez le lien et le partage.\n{e}"); return None

def selectionner_source_terminal():
    print("\n" + "="*60)
    print("▶️  Glissez-déposez un fichier OU collez un lien Google Doc,")
    print("   puis appuyez sur la touche [Entrée].")
    print("="*60)
    chemin_brut = input()
    return chemin_brut.strip().strip("'").strip('"')

# --- Fonctions de mémoire et d'extraction des mots-clés ---
def preparer_dossier_export():
    dossier_export = os.path.join(os.path.expanduser("~"), "Desktop", "Lecteur_Exports")
    os.makedirs(dossier_export, exist_ok=True)
    return dossier_export

def charger_memoire(dossier_export, nom_fichier):
    chemin_memoire = os.path.join(dossier_export, nom_fichier)
    if os.path.exists(chemin_memoire):
        with open(chemin_memoire, 'r', encoding='utf-8') as f: return json.load(f)
    return {}

def sauvegarder_memoire(dossier_export, memoire, nom_fichier):
    chemin_memoire = os.path.join(dossier_export, nom_fichier)
    with open(chemin_memoire, 'w', encoding='utf-8') as f: json.dump(memoire, f, indent=4)

def extraire_mots_cles(texte, dossier_export, nombre_mots=100):
    memoire = charger_memoire(dossier_export, "memoire_mots_cles.json")
    print("🧠 Mémoire des mots-clés chargée.")
    stopwords = set(get_stop_words('fr') + get_stop_words('en'))
    noms_a_ignorer = set(['dominique loyer', 'dominique s. loyer'])
    noms_propres = re.findall(r'(?<![.\s])\b[A-Z][a-z]{3,}\b', texte)
    mots = re.findall(r'\b\w+\b', texte.lower())
    mots_filtres = [mot for mot in mots if mot not in stopwords and len(mot) > 3]
    compteur_actuel = collections.Counter(mots_filtres)
    for mot, freq in compteur_actuel.items():
        if mot not in noms_a_ignorer:
            memoire[mot] = memoire.get(mot, 0) + freq
    mots_cles_memoire = sorted(memoire, key=memoire.get, reverse=True)
    mots_cles_combines = list(dict.fromkeys(noms_propres + mots_cles_memoire))
    mots_cles_finaux = [mot for mot in mots_cles_combines if mot.lower() not in stopwords and mot.lower() not in noms_a_ignorer][:nombre_mots]
    return mots_cles_finaux, memoire

def reviser_mots_cles_interactivement(mots_cles_proposes, memoire):
    mots_cles_approuves = list(mots_cles_proposes)
    while True:
        print("\n" + "="*80 + "\n✍️  ATELIER DE MOTS-CLÉS INTERACTIF\n" + "="*80)
        num_mots = len(mots_cles_approuves)
        num_cols = 4
        num_rows = math.ceil(num_mots / num_cols)
        col_width = 20
        for i in range(num_rows):
            line = ""
            for j in range(num_cols):
                index = i + j * num_rows
                if index < num_mots:
                    mot = f"{index+1:2d}. {mots_cles_approuves[index]}"
                    line += mot.ljust(col_width)
            print(line)
        print("\nCommandes:\n  - Numéro pour DÉSAPPROUVER (ex: 5)\n  - '+' suivi d'un mot pour AJOUTER (ex: +cybersécurité)\n  - 'ok' ou [Entrée] pour VALIDER.")
        choix = input("\nVotre choix : ").strip()
        if not choix or choix.lower() == 'ok': break
        elif choix.startswith('+'):
            nouveau_mot = choix[1:].strip().lower()
            if nouveau_mot and nouveau_mot not in mots_cles_approuves:
                mots_cles_approuves.append(nouveau_mot)
                memoire[nouveau_mot] = memoire.get(nouveau_mot, 0) + 100
                print(f"✅ '{nouveau_mot}' ajouté et promu.")
        elif choix.isdigit():
            try:
                index = int(choix) - 1
                if 0 <= index < len(mots_cles_approuves):
                    mot_retire = mots_cles_approuves.pop(index)
                    memoire[mot_retire] = max(0, memoire.get(mot_retire, 0) - 50)
                    print(f"❌ '{mot_retire}' retiré et rétrogradé.")
                else: print("Numéro invalide.")
            except ValueError: print("Entrée non reconnue.")
        else: print("Commande invalide.")
    print(f"\n--- {len(mots_cles_approuves)} Mots-clés finaux validés. ---")
    return mots_cles_approuves, memoire

def surligner_texte(texte, mots_cles):
    texte_colore = texte
    paires_couleurs = [(Couleurs.FG_NOIR, Couleurs.BG_JAUNE), (Couleurs.FG_NOIR, Couleurs.BG_CYAN), (Couleurs.FG_NOIR, Couleurs.BG_MAGENTA), (Couleurs.FG_NOIR, Couleurs.BG_VERT)]
    cycle_couleurs = cycle(paires_couleurs)
    mots_tries = sorted(mots_cles, key=len, reverse=True)
    for mot in mots_tries:
        couleur_texte, couleur_fond = next(cycle_couleurs)
        pattern = r'\b(' + re.escape(mot) + r')\b'
        texte_colore = re.sub(pattern, f"{Couleurs.BOLD}{couleur_fond}{couleur_texte}\\1{Couleurs.RESET}", texte_colore, flags=re.IGNORECASE)
    return texte_colore

def nettoyer_texte_pour_lecture(texte):
    texte_nettoye = texte.replace('-', ' ')
    texte_nettoye = re.sub(r'\(.*?\)', '', texte_nettoye)
    texte_nettoye = re.sub(r'\{.*?\}', '', texte_nettoye)
    texte_nettoye = re.sub(r'\s*\([^)]+\d{4}[^)]*\)', '', texte_nettoye)
    texte_nettoye = re.sub(r'Dominique\s+S?\.\s+Loyer', '', texte_nettoye, flags=re.IGNORECASE)
    return texte_nettoye.strip()

# --- Contrôleur de clavier et lecture audio fiables ---
def arreter_son_en_cours():
    if etat_lecture["processus_audio"] and etat_lecture["processus_audio"].poll() is None:
        etat_lecture["processus_audio"].terminate()
        etat_lecture["processus_audio"].wait()

def controleur_clavier():
    def on_press(key):
        global etat_lecture
        if etat_lecture["quitter"].is_set(): return False
        
        action_detectee = False
        if hasattr(key, 'char'):
            if key.char in ['p', 'q']: action_detectee = True
        elif key in [keyboard.Key.right, keyboard.Key.left]: action_detectee = True
        
        if action_detectee: arreter_son_en_cours()

        if hasattr(key, 'char'):
            if key.char == 'p':
                # The "pause" event is really a "playing" flag: cleared = paused, set = playing.
                if etat_lecture["pause"].is_set():
                    print(f"\n--- EN PAUSE ---", end="", flush=True)
                    etat_lecture["pause"].clear()
                else:
                    print(f"\n--- REPRISE ---", end="", flush=True)
                    etat_lecture["pause"].set()
            elif key.char == 'q':
                etat_lecture["quitter"].set(); etat_lecture["pause"].set()
                return False
        elif key == keyboard.Key.right:
            etat_lecture["prochain"].set()
        elif key == keyboard.Key.left:
            etat_lecture["precedent"].set()
            
    listener = keyboard.Listener(on_press=on_press)
    listener.start()
    return listener

# --- FONCTION DE LECTURE PRINCIPALE (v32 - Interactive Stable) ---
def parler(texte, mots_cles, start_index=0):
    global etat_lecture
    paragraphes = re.split(r'\n\s*\n', texte)
    
    listener = controleur_clavier()
    print("\n--- Début de la lecture ---")
    print("Contrôles: [p] Pause/Reprise | [→] Paragraphe suivant | [←] Paragraphe précédent | [q] Quitter")
    
    index_para = start_index
    while index_para < len(paragraphes):
        if etat_lecture["quitter"].is_set(): break
        
        etat_lecture["pause"].wait()

        if etat_lecture["prochain"].is_set():
            index_para = min(index_para + 1, len(paragraphes) - 1)
            etat_lecture["prochain"].clear()
        if etat_lecture["precedent"].is_set():
            index_para = max(index_para - 1, 0)
            etat_lecture["precedent"].clear()

        paragraphe_original = paragraphes[index_para].strip()
        if not paragraphe_original: index_para += 1; continue
        
        fichier_audio_temp = f"temp_audio_{index_para}.mp3"
        fichier_wav_temp = f"temp_audio_{index_para}.wav"
        
        try:
            paragraphe_a_lire = nettoyer_texte_pour_lecture(paragraphe_original)
            texte_terminal = surligner_texte(paragraphe_original, mots_cles)
            
            print(f"\n▶️  {texte_terminal}")

            if not paragraphe_a_lire: index_para += 1; continue

            try: lang = detect(paragraphe_a_lire)
            except LangDetectException: lang = 'fr'
            tts_lang = 'en' if 'en' in lang else 'fr'
            
            tts = gTTS(text=paragraphe_a_lire, lang=tts_lang, slow=False)
            tts.save(fichier_audio_temp)
            
            if os.path.exists(fichier_audio_temp) and os.path.getsize(fichier_audio_temp) > 0:
                sound = AudioSegment.from_mp3(fichier_audio_temp)
                faster_sound = sound.speedup(playback_speed=1.50)
                faster_sound.export(fichier_wav_temp, format="wav")
                
                with open(os.devnull, 'w') as FNULL:
                    etat_lecture["processus_audio"] = subprocess.Popen(["afplay", fichier_wav_temp], stdout=FNULL, stderr=FNULL)
                
                etat_lecture["processus_audio"].wait()
            
            if not any([etat_lecture["prochain"].is_set(), etat_lecture["precedent"].is_set()]):
                 index_para += 1
        except Exception as e:
            print(f"Erreur durant la lecture : {e}")
            index_para += 1
        finally:
            if os.path.exists(fichier_audio_temp): os.remove(fichier_audio_temp)
            if os.path.exists(fichier_wav_temp): os.remove(fichier_wav_temp)
            
    etat_lecture["quitter"].set()
    arreter_son_en_cours()
    listener.stop()
    print("\n\n--- Fin de la session de lecture. ---")
    
    if index_para >= len(paragraphes):
        return 0
    return index_para

# --- POINT D'ENTRÉE PRINCIPAL DU SCRIPT ---
if __name__ == "__main__":
    print("--- Lancement du lecteur de documents intelligent ---")
    
    source_key = selectionner_source_terminal()
    contenu = None
    
    if source_key.startswith('http'):
        contenu = lire_google_doc(source_key)
    elif os.path.exists(source_key):
        contenu = lire_document(source_key)
    else:
        print("Source invalide ou fichier non trouvé. Fin du programme.")
    
    if contenu:
        dossier_exportation = preparer_dossier_export()
        progression = charger_memoire(dossier_exportation, "progression_lecture.json")
        start_index = progression.get(source_key, 0)
        
        if start_index > 0:
            print(f"\nUne progression a été trouvée pour ce document (paragraphe {start_index + 1}).")
            choix_reprise = input("Voulez-vous reprendre la lecture ? (o/n) : ").lower()
            if choix_reprise != 'o':
                start_index = 0
        
        mots_cles_proposes, memoire_initiale = extraire_mots_cles(contenu, dossier_exportation)
        mots_cles_finaux, memoire_mise_a_jour = reviser_mots_cles_interactivement(mots_cles_proposes, memoire_initiale)
        sauvegarder_memoire(dossier_exportation, memoire_mise_a_jour, "memoire_mots_cles.json")
        
        index_final = parler(contenu, mots_cles_finaux, start_index=start_index)
        
        progression[source_key] = index_final
        sauvegarder_memoire(dossier_exportation, progression, "progression_lecture.json")
        print(f"Progression sauvegardée au paragraphe {index_final + 1}.")
        
        print("\n--- Processus terminé. ---")

To cite this code:

Loyer, Dominique. (2024). lect20juil.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

misc 1.R

This R script performs sentiment analysis on tweets, covering extraction, cleaning and a word-cloud visualization, and also carries out exploration, cleaning and predictive modelling (decision trees, neural networks) on census data to predict income.

Keywords: Sentiment analysis, Text data, Data processing, Decision tree, Neural network
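
Before the full exploratory script, here is a compact, self-contained sketch of the sentiment-scoring step it builds up to: score a character vector with syuzhet's NRC lexicon and plot the per-emotion totals. Only the syuzhet and ggplot2 packages are assumed, and the two sample texts are illustrative stand-ins for the cleaned tweets.

```R
# Minimal sketch of the NRC sentiment scoring used in the script below.
# The sample texts are illustrative; the real script scores cleaned tweet text instead.
library(syuzhet)
library(ggplot2)

textes <- c("I love this conference, the talks are great!",
            "The wifi is terrible and I am annoyed.")

scores <- get_nrc_sentiment(textes)            # one row per text, one column per emotion
totaux <- data.frame(sentiment = colnames(scores),
                     Score     = colSums(scores))

ggplot(totaux, aes(x = sentiment, y = Score, fill = sentiment)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Sentiment", y = "Score", title = "Total sentiment score")
```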

```R
## ********** Steps to set up authorization to connect and extract tweets **********

### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")
# tweets

library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(SentimentAnalysis)
library(curl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## Connect to the API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application
consumerKey="****"
consumerSecret="K****"
accesstoken="****"
accesssecret="****"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console: open it, get the PIN code and enter it in the console.

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata')
## Once you have run the code a first time, you can start from this line in future sessions (the libraries still need to be loaded).

setup_twitter_oauth(consumerKey, consumerSecret, accesstoken, accesssecret )




##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

# twitteR::searchTwitter() returns status objects, which the code below expects
some_tweets = searchTwitter("#rstudioconf", n = 100000)

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- keep only alphanumeric characters

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating word corpus and cleaning (Corpus/tm_map/stopwords come from the tm package)
library(tm)

some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(8, "Dark2")   # the "Dark2" palette has at most 8 colours

wordcloud(some_txt6, min.freq = 5, max.words = Inf, random.order = FALSE, colors = pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")



-------------------------
  
  
  
## Technology: I used R for the analysis (here is the code)

## Author: Dominique Loyer


## Questions 1 and 2: exploration of the data

Census <- read.csv("~/Desktop/CensusClean.csv")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- readxl::read_excel("Census.xlsx")
view(pwc)
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
subset (pwc, Education %in% "Masters" AND `Income Group` %in% "50k" OR `Income Group` %in% "50k.")
subset (pwc, Education %in% "Masters" AND Income Group %in% "50k" OR Income Group %in% "50k.")
subset (pwc, Education %in% "Masters" OR "Doctorates")
subset (pwc, Education %in% "Masters" | "Doctorates")
subset (pwc, Education %in% "Masters"|"Doctorates")
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% >50k)
master50kplus <- subset (master, `Income Group` >50k)
master50kplus <- subset (master, `Income Group` >50k)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
master50kplus+master50kplusDot
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)


## Question 3

## Cleaning: mytreeadult

###categorical variables as factor
pwcClean$workclass <- as.factor(pwcClean$workclass)
pwcClean$`Income Group` <- as.factor(pwcClean$`Income Group`)
pwcClean$Education <- as.factor(pwcClean$Education)
pwcClean$`Occupation Status` <- as.factor(pwcClean$`Occupation Status`)
pwcClean$Relationship <- as.factor(pwcClean$Relationship)
pwcClean$Gender <- as.factor(pwcClean$Gender)
pwcClean$`Native country` <- as.factor(pwcClean$`Native country`)
summary(pwcClean)
pwcClean$`Marital Status` <- as.factor(pwcClean$`Marital Status`)
summary(pwcClean)
pwcClean$Age=(pwcClean$Age-min(pwcClean$Age))/(max(pwcClean$Age)-min(pwcClean$Age))

range(pwcClean$`Demographic Adjustment`)
hist(pwcClean$`Demographic Adjustment`)

###replacing missing values and merging Income Group (50k and 50k.)

install.packages("tidyr")
library("tidyr")
pwc$`Occupation Status` <- sub("?", "Other-service", pwc$`Occupation Status`, fixed = TRUE)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
pwc$workclass[pwc$workclass==" ?"] = as.character(sample(pwc$workclass[which(pwc$workclass !=" ?")], 1774, replace = FALSE))
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50less <- subset (pwc, `Income Group` %in% "<=50K")



###normalization with Min-Max
pwcClean$`Demographic Adjustment`=(pwcClean$`Demographic Adjustment`-min(pwcClean$`Demographic Adjustment`))/(max(pwcClean$`Demographic Adjustment`)-min(pwcClean$`Demographic Adjustment`))
hist(pwcClean$`Demographic Adjustment`)
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education`))
pwcClean$`capital-gain`=(pwcClean$`capital-gain`-min(pwcClean$`capital-gain`))/(max(pwcClean$`capital-gain`)-min(pwcClean$`capital-gain`))
pwcClean$`capital-loss`=(pwcClean$`capital-loss`-min(pwcClean$`capital-loss`))/(max(pwcClean$`capital-loss`)-min(pwcClean$`capital-loss`))
pwcClean$`hours-per-week`=(pwcClean$`hours-per-week`-min(pwcClean$`hours-per-week`))/(max(pwcClean$`hours-per-week`)-min(pwcClean$`hours-per-week`))

###Normalized and cleaned data
pwcClean.normalized <- pwcClean
pwcClean.normalizedCopy <- pwcClean.normalized



## Decision Tree
##pruning the tree with rpart
install.packages("rpart")
library(rpart)
library(rpart.plot)
mytreeadult=rpart(`Income Group`~Education+workclass+`Occupation Status`+Gender+Age+`hours-per-week`
                  , data=pwcClean.normalized, method="class", control=rpart.control(minsplit=1))
mytreeadult
plot(mytreeadult)
rpart.plot(mytreeadult, type=3, extra=101, fallen.leaves = TRUE)
text(mytreeadult, use.n=T, all=T, pretty=0, cex=0.9, xpd=TRUE)
estincome.class=predict(mytreeadult, newdata=pwcClean.normalized, type="class")

## Confusion table on the full data
t1=table(pwcClean.normalized$`Income Group`, estincome.class)
t1
# overall accuracy: correctly classified / total
(10061+1354+11564+1505)/30511



###Training data 20000 out of 30511
index <- sample(1:nrow(pwcClean.normalized), 20000)
pwc.test = pwcClean.normalized[index,]
pwc.valid = pwcClean.normalized[-index,]

### Validation data: predict with the fitted tree on the held-out rows, then tabulate
pwc.valid$est.income = predict(mytreeadult, newdata = pwc.valid, type = "class")
Tpwc=table(pwc.valid$`Income Group`, pwc.valid$est.income)
Tpwc
(7457+1519)/(30511-20000)

## Neural network with 10 hidden units (nnet fits a single hidden layer; size = 10)

require(nnet)
library(nnet)
set.seed(99999999)

pwc.net = nnet(`Income Group`~.,data=pwc.test,size=10)
pwc.valid$est.income = predict(pwc.net,pwc.valid,type="class")
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc


## General rpart call template:
## rpart(formula, data=, method=, control=)
fit<-rpart(Default_On_Payment~Status_Checking_Acc+Duration_in_Months+Credit_History+Purposre_Credit_Taken+
             Credit_Amount+Savings_Acc+Years_At_Present_Employment+Inst_Rt_Income+Marital_Status_Gender+
             Other_Debtors_Guarantors+Current_Address_Yrs+Property+Age+Other_Inst_Plans+Housing+
             Num_CC+Job+Dependents+Telephone+Foreign_Worker,
           data=cust_data, method="class",
           control=rpart.control(minsplit=20, cp=0.01))



-------------------------
  
  
## PWC notes

2044 Masters and Doctorates; earning more than 50k: 1181 = 3.87% (1181/30511)
23265 earning less than or equal to 50k


Missing data: workclass 1769; Occupation Status 1774; Native country 531

Census <- read_excel("~/Desktop/PwC/Census.xlsx")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- read_excel("Census.xlsx")
view(pwc)
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
subset (pwc, Education %in% "Masters" AND `Income Group` %in% "50k" OR `Income Group` %in% "50k.")
subset (pwc, Education %in% "Masters" AND Income Group %in% "50k" OR Income Group %in% "50k.")
subset (pwc, Education %in% "Masters" OR "Doctorates")
subset (pwc, Education %in% "Masters" | "Doctorates")
subset (pwc, Education %in% "Masters"|"Doctorates")
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% >50k)
master50kplus <- subset (master, `Income Group` >50k)
master50kplus <- subset (master, `Income Group` >50k)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
master50kplus+master50kplusDot
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)
hist(pwc)
subset (pwc, Age <=18)
subset (pwc, Age <=15)
subset (pwc, Age <=16)
subset (pwc, Age <=17)



-------------------
  
## tweets

##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("/Users/SherbrookeInformatique/Bureau")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(sentiment)
library(RCurl)
library(syuzhet)
library(sentimentr)
# load twitter library - the rtweet library is recommended now over twitteR
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)

install.packages("httpuv")
library(httpuv)



### Fill out my tokens
consumer_key=""
consumer_secret=""
access_token=""
access_secret=""



setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
tw = twitteR::searchTwitter('#realDonaldTrump + HillaryClinton', n = 6, since = '2016-11-08', retryOnRateLimit = 1e3)
d = twitteR::twListToDF(tw)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## Connect to the API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### put my credientials here
consumer_key=""
consumer_secret=""
access_token=""
access_token_secret=""

Cred <- OAuthFactory$new(consumerKey=consumer_key,
                         consumerSecret=consumer_secret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console: open it, get the PIN code and enter it in the console.

##### Authorization PIN -DYNAMIC


Elec = search_tweets("#Election2020", n=100000, lang='en', include_rts=FALSE)
Elec





save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata')
## Once you have run the code a first time, you can start from this line in future sessions (the libraries still need to be loaded).

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_token_secret)



##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = searchTwitter("Election2020", n=1000, since = "2020-11-03", lang= "en")

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating word corpus and cleaning (Corpus/tm_map/stopwords come from the tm package)
library(tm)

some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(8, "Dark2")   # the "Dark2" palette has at most 8 colours

wordcloud(some_txt6, min.freq = 5, max.words = Inf, random.order = FALSE, colors = pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")






---------------------
  
## tweets

##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(SentimentAnalysis)
library(curl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## Connect to the API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application (credentials masked)
consumerKey="****"
consumerSecret="****"
accesstoken="****"
accesssecret="****"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console: open it, get the PIN code and enter it in the console.

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata')
## Once you have run the code a first time, you can start from this line in future sessions (the libraries still need to be loaded).

setup_twitter_oauth(consumerKey, consumerSecret, accesstoken, accesssecret )




##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

# twitteR::searchTwitter() returns status objects, which the code below expects
some_tweets = searchTwitter("#rstudioconf", n = 1000)

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- keep only alphanumeric characters

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating word corpus and cleaning (Corpus/tm_map/stopwords come from the tm package)
library(tm)

some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(8, "Dark2")   # the "Dark2" palette has at most 8 colours

wordcloud(some_txt6, min.freq = 5, max.words = Inf, random.order = FALSE, colors = pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data


mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")

```
# PWC

```R
[[Technology]]: I used R for the analysis (here is the code) 

##Auteur: Dominique Loyer


[[Question]] 1 and 2 and exploration of data

Census <- read.csv("~/Desktop/CensusClean.csv")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- readxl::read_excel("Census.xlsx")
pwc <- read("Census.xlsx")
view(pwc)
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
subset (pwc, Education %in% "Masters" AND `Income Group` %in% "50k" OR `Income Group` %in% "50k.")
subset (pwc, Education %in% "Masters" AND Income Group %in% "50k" OR Income Group %in% "50k.")
subset (pwc, Education %in% "Masters" OR "Doctorates")
subset (pwc, Education %in% "Masters" | "Doctorates")
subset (pwc, Education %in% "Masters"|"Doctorates")
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% >50k)
master50kplus <- subset (master, `Income Group` >50k)
master50kplus <- subset (master, `Income Group` >50k)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
master50kplus+master50kplusDot
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)


[[Question]] 3

##Cleaningmytreeadult

###categorical variables as factor
pwcClean$workclass <- as.factor(pwcClean$workclass)
pwcClean$`Income Group` <- as.factor(pwcClean$`Income Group`)
pwcClean$Education <- as.factor(pwcClean$Education)
pwcClean$`Occupation Status` <- as.factor(pwcClean$`Occupation Status`)
pwcClean$Relationship <- as.factor(pwcClean$Relationship)
pwcClean$Gender <- as.factor(pwcClean$Gender)
pwcClean$`Native country` <- as.factor(pwcClean$`Native country`)
summary(pwcClean)
pwcClean$`Marital Status` <- as.factor(pwcClean$`Marital Status`)
summary(pwcClean)
pwcClean$Age=(pwcClean$Age-min(pwcClean$Age))/(max(pwcClean$Age)-min(pwcClean$Age))

range(pwcClean$`Demographic Adjustment`)
hist(pwcClean$`Demographic Adjustment`)

###replacing missing values and merging Income Group (50k and 50k.)

install.packages("tidyr")
library("tidyr")
pwc$`Occupation Status` <- sub("?", "Other-service", pwc$`Occupation Status`)
subset (pwc, Education %in% "Masters" AND `Income Group` %in% "50k" OR `Income Group` %in% "50k.")
pwc$workclass[pwc$workclass==" ?"] = as.character(sample(pwc$workclass[which(pwc$workclass !=" ?")], 1774, replace = FALSE))
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50less <- subset (pwc, `Income Group` %in% "<=50K")



###normalization with Min-Max
pwcClean$`Demographic Adjustment`=(pwcClean$`Demographic Adjustment`-min(pwcClean$`Demographic Adjustment`))/(max(pwcClean$`Demographic Adjustment`)-min(pwcClean$`Demographic Adjustment`))
hist(pwcClean$`Demographic Adjustment`)
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education))
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education))
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education`))
pwcClean$`capital-gain`=(pwcClean$`capital-gain`-min(pwcClean$`capital-gain`))/(max(pwcClean$`capital-gain`)-min(pwcClean$`capital-gain`))
pwcClean$`capital-loss`=(pwcClean$`capital-loss`-min(pwcClean$`capital-loss`))/(max(pwcClean$`capital-loss`)-min(pwcClean$`capital-loss`))
pwcClean$`hours-per-week`=(pwcClean$`hours-per-week`-min(pwcClean$`hours-per-week`))/(max(pwcClean$`hours-per-week`)-min(pwcClean$`hours-per-week`))

###Normalized and cleaned data
pwcClean.normalized <- pwcClean
pwcClean.normalizedCopy <- pwcClean.normalized



[[Decision]] Tree
##pruning the tree with rpart
install.packages("rpart")
library(rpart)
library(rpart.plot)
mytreeadult=rpart(`Income Group`~Education+workclass+`Occupation Status`+Gender+Age+`hours-per-week`
                  , data=pwcClean.normalized, method="class", control=rpart.control(minsplit=1))
mytreeadult
plot(mytreeadult)
rpart.plot(mytreeadult, type=3, extra=101, fallen.leaves = TRUE)
text(mytreeadult, use.n=T, all=T, pretty=0, cex=0.9, xpd=TRUE)
estincome.class=predict(mytreeadult, data=pwcClean.normalized, type="class")

##cross-validation table

t1=table(`Income Group`, estincome.class)
t1
(10061+1354)+(11564+1505)/30511
(10061+1354+11564+1505)/30511



###Training data 20000 out of 30511
index <- sample(1:nrow(pwcClean.normalized), 20000)
pwc.test = pwcClean.normalized[index,]
pwc.valid = pwcClean.normalized[-index,]

###validation data
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc
(7457+1519)/(30511-20000)

[[Neural]] network with 10 hidden layers

require(nnet)
library(nnet)
set.seed(99999999)

pwc.net = nnet(`Income Group`~.,data=pwc.test,size=10)
pwc.valid$est.income = predict(pwc.net,pwc.valid,type="class")
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc


clear(
  
)
rpart(formula, data=, method=,control=)
###
rpart(formula, data=, method=,control=)
fit<-rpart(Default_On_Payment~Status_Checking_Acc+Duration_in_Months+Credit_History+Purposre_Credit_Taken+
             Credit_Amount+Savings_Acc+Years_At_Present_Employment+Inst_Rt_Income+Marital_Status_Gender+
             Other_Debtors_Guarantors+Current_Address_Yrs+Property+Age+Other_Inst_Plans+Housing+
             Num_CC+Job+Dependents+Telephone+Foreign_Worker,
           data=cust_data, method="class",
           control=rpart.control(minsplit=20, cp=0.01))











```
# Tweets Word Cloud
```R
##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(sentimentanalysis)
library(curl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

[[connect]] to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application
consumerKey="****"
consumerSecret="K****"
accesstoken="****"
accesssecret="****"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console; open it, get the PIN code, and enter it in the console.

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
## After the first run, you can start from this line in future sessions (libraries must already be loaded).

setup_twitter_oauth(consumerKey, consumerSecret, accesstoken, accesssecret )




##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = search_tweets("#rstudioconf", n=100000, include_rts =FALSE)

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

library(tm)  ## Corpus(), tm_map(), removeWords() and stopwords() come from the tm package
some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, color=pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")

```


To cite this code:

Loyer, Dominique. (2024). misc 1.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

misc.R

This R code performs sentiment analysis on tweets, including text extraction and cleaning, and models census-style demographic data to predict income using decision trees and neural networks.

Keywords: sentiment analysis, Twitter, income prediction, machine learning, data cleaning

```R
## tweets
##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")

library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
## library(sentimentanalysis)  # no CRAN package by this name; get_nrc_sentiment() below comes from syuzhet
library(curl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## connect to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application
consumerKey="****"
consumerSecret="K****"
accesstoken="****"
accesssecret="****"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console; open it, get the PIN code, and enter it in the console.

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
## After the first run, you can start from this line in future sessions (libraries must already be loaded).

setup_twitter_oauth(consumerKey, consumerSecret, accesstoken, accesssecret )




##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = search_tweets("#rstudioconf", n=100000, include_rts =FALSE)

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

library(tm)  ## Corpus(), tm_map(), removeWords() and stopwords() come from the tm package
some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, color=pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")



-------------------------
  
  
  
## Technology: I used R for the analysis (here is the code)

## Author: Dominique Loyer


## Question 1 and 2 and exploration of data

Census <- read.csv("~/Desktop/CensusClean.csv")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- readxl::read_excel("Census.xlsx")
## pwc <- read("Census.xlsx")   # invalid: read() does not exist; read_excel() above is used
view(pwc)
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
## exploratory subsets (syntax corrected: R uses & and | rather than AND/OR)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
subset(pwc, Education %in% c("Masters", "Doctorate"))
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
## (earlier attempts with an unquoted >50k were invalid; the working filter is below)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
nrow(master50kplus) + nrow(master50kplusDot)
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)


## Question 3

##Cleaningmytreeadult

###categorical variables as factor
pwcClean$workclass <- as.factor(pwcClean$workclass)
pwcClean$`Income Group` <- as.factor(pwcClean$`Income Group`)
pwcClean$Education <- as.factor(pwcClean$Education)
pwcClean$`Occupation Status` <- as.factor(pwcClean$`Occupation Status`)
pwcClean$Relationship <- as.factor(pwcClean$Relationship)
pwcClean$Gender <- as.factor(pwcClean$Gender)
pwcClean$`Native country` <- as.factor(pwcClean$`Native country`)
summary(pwcClean)
pwcClean$`Marital Status` <- as.factor(pwcClean$`Marital Status`)
summary(pwcClean)
pwcClean$Age=(pwcClean$Age-min(pwcClean$Age))/(max(pwcClean$Age)-min(pwcClean$Age))

range(pwcClean$`Demographic Adjustment`)
hist(pwcClean$`Demographic Adjustment`)

###replacing missing values and merging Income Group (50k and 50k.)

install.packages("tidyr")
library("tidyr")
pwc$`Occupation Status` <- sub("?", "Other-service", pwc$`Occupation Status`, fixed = TRUE)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
pwc$workclass[pwc$workclass==" ?"] = as.character(sample(pwc$workclass[which(pwc$workclass !=" ?")], 1774, replace = FALSE))
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50less <- subset (pwc, `Income Group` %in% "<=50K")



###normalization with Min-Max
pwcClean$`Demographic Adjustment`=(pwcClean$`Demographic Adjustment`-min(pwcClean$`Demographic Adjustment`))/(max(pwcClean$`Demographic Adjustment`)-min(pwcClean$`Demographic Adjustment`))
hist(pwcClean$`Demographic Adjustment`)
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education`))
pwcClean$`capital-gain`=(pwcClean$`capital-gain`-min(pwcClean$`capital-gain`))/(max(pwcClean$`capital-gain`)-min(pwcClean$`capital-gain`))
pwcClean$`capital-loss`=(pwcClean$`capital-loss`-min(pwcClean$`capital-loss`))/(max(pwcClean$`capital-loss`)-min(pwcClean$`capital-loss`))
pwcClean$`hours-per-week`=(pwcClean$`hours-per-week`-min(pwcClean$`hours-per-week`))/(max(pwcClean$`hours-per-week`)-min(pwcClean$`hours-per-week`))
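## Added sketch (not in the original script): the repeated pattern above is min-max
## scaling, x' = (x - min(x)) / (max(x) - min(x)); a small helper avoids retyping it:
minmax <- function(x) (x - min(x)) / (max(x) - min(x))
## e.g. pwcClean$Age <- minmax(pwcClean$Age)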

###Normalized and cleaned data
pwcClean.normalized <- pwcClean
pwcClean.normalizedCopy <- pwcClean.normalized



## Decision Tree
##pruning the tree with rpart
install.packages("rpart")
library(rpart)
library(rpart.plot)
mytreeadult=rpart(`Income Group`~Education+workclass+`Occupation Status`+Gender+Age+`hours-per-week`
                  , data=pwcClean.normalized, method="class", control=rpart.control(minsplit=1))
mytreeadult
plot(mytreeadult)
rpart.plot(mytreeadult, type=3, extra=101, fallen.leaves = TRUE)
text(mytreeadult, use.n=T, all=T, pretty=0, cex=0.9, xpd=TRUE)
estincome.class=predict(mytreeadult, newdata=pwcClean.normalized, type="class")

##cross-validation table

t1=table(pwcClean.normalized$`Income Group`, estincome.class)
t1
## note: the first expression below is wrong because / binds tighter than + ; the corrected version follows
## (10061+1354)+(11564+1505)/30511
(10061+1354+11564+1505)/30511



###Training data 20000 out of 30511
index <- sample(1:nrow(pwcClean.normalized), 20000)
pwc.test = pwcClean.normalized[index,]
pwc.valid = pwcClean.normalized[-index,]

###validation data
## note: pwc.valid$est.income is only created by the nnet model below; build this table after running it
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc
(7457+1519)/(30511-20000)

## Neural network with a single hidden layer of 10 units (nnet size = 10)

require(nnet)
library(nnet)
set.seed(99999999)

pwc.net = nnet(`Income Group`~.,data=pwc.test,size=10)
pwc.valid$est.income = predict(pwc.net,pwc.valid,type="class")
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc


## clear() is not a base R function; use rm(list = ls()) if you want to clear the workspace
## rpart() argument template for reference:
## rpart(formula, data=, method=, control=)
fit<-rpart(Default_On_Payment~Status_Checking_Acc+Duration_in_Months+Credit_History+Purposre_Credit_Taken+
             Credit_Amount+Savings_Acc+Years_At_Present_Employment+Inst_Rt_Income+Marital_Status_Gender+
             Other_Debtors_Guarantors+Current_Address_Yrs+Property+Age+Other_Inst_Plans+Housing+
             Num_CC+Job+Dependents+Telephone+Foreign_Worker,
           data=cust_data, method="class",
           control=rpart.control(minsplit=20, cp=0.01))



-------------------------
  
  
## PWC notes

## Masters and Doctorate records: 2044; of these, 1181 earn more than 50K (1181 / 30511 ≈ 3.87 %)
## 23265 records earn 50K or less


## missing data
## workclass: 1769
## Occupation Status: 1774
## Native country: 531
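## arithmetic check (from the counts computed below): Masters >50K = 500 + 426 = 926;
## Doctorates >50K = 130 + 125 = 255; 926 + 255 = 1181 and 1181 / 30511 ≈ 3.87 %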

Census <- read_excel("~/Desktop/PwC/Census.xlsx")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- read_excel("Census.xlsx")
view(pwc)
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
## exploratory subsets (syntax corrected: R uses & and | rather than AND/OR)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
subset(pwc, Education %in% c("Masters", "Doctorate"))
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
## (earlier attempts with an unquoted >50k were invalid; the working filter is below)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
nrow(master50kplus) + nrow(master50kplusDot)
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)
## hist(pwc)   # hist() needs a numeric vector, e.g. hist(pwc$Age), not a whole data frame
subset (pwc, Age <=18)
subset (pwc, Age <=15)
subset (pwc, Age <=16)
subset (pwc, Age <=17)



-------------------
  
## tweets

##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("/Users/SherbrookeInformatique/Bureau")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(sentiment)
library(RCurl)
library(syuzhet)
library(sentimentr)
# load twitter library - the rtweet library is recommended now over twitteR
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)

install.packages("httpuv")
library(httpuv)



### Fill out my tokens
consumer_key=""
consumer_secret=""
access_token=""
access_secret=""



setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
tw = twitteR::searchTwitter('#realDonaldTrump + HillaryClinton', n = 6, since = '2016-11-08', retryOnRateLimit = 1e3)
d = twitteR::twListToDF(tw)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## connect to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### put my credientials here
consumer_key=""
consumer_secret=""
access_token=""
access_token_secret=""

Cred <- OAuthFactory$new(consumerKey=consumer_key,
                         consumerSecret=consumer_secret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console; open it, get the PIN code, and enter it in the console.

##### Authorization PIN -DYNAMIC


Elec = search_tweets("#Election2020", n=100000, lang='en', include_rts=FALSE)
Elec





save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
## After the first run, you can start from this line in future sessions (libraries must already be loaded).

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_token_secret)



##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = searchTwitter("Election2020", n=1000, since = "2020-11-03", lang= "en")

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

library(tm)  ## Corpus(), tm_map(), removeWords() and stopwords() come from the tm package
some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, color=pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")






---------------------
  
  [[tweets]] 

##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
## library(sentimentanalysis)  # no CRAN package by this name; get_nrc_sentiment() below comes from syuzhet
library(curl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## connect to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application
consumerKey="U9PZprvQsTHlEAhr87oyc0mhY"
consumerSecret="KphaxCtCh5XDw0t4e4JCJ7pv3R7lPnRQ10PYgwEkEPJPdo0UX0"
accesstoken="1084536220141740035-ywa1yvmxzPni8Kt8pjmxVP07gmz253"
accesssecret="Kkk2CQVGWynjJfGsXgZsQiXRuBIHWfC8gHVZa5DcQtbgLg"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl')) [[There]] is URL in Console. You need to go to it, get code and enter it on Console

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
## After the first run, you can start from this line in future sessions (libraries must already be loaded).

setup_twitter_oauth(consumerKey, consumerSecret, accesstoken, accesssecret )




##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = search_tweets("#rstudioconf", n=1000, include_rts =FALSE)

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

library(tm)  ## Corpus(), tm_map(), removeWords() and stopwords() come from the tm package
some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, color=pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data


mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")

```
# PWC

```R
## Technology: I used R for the analysis (here is the code)

## Author: Dominique Loyer


## Question 1 and 2 and exploration of data

Census <- read.csv("~/Desktop/CensusClean.csv")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- readxl::read_excel("Census.xlsx")
## pwc <- read("Census.xlsx")   # invalid: read() does not exist; read_excel() above is used
view(pwc)
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
## exploratory subsets (syntax corrected: R uses & and | rather than AND/OR)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
subset(pwc, Education %in% c("Masters", "Doctorate"))
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
filter(master)
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
## (earlier attempts with an unquoted >50k were invalid; the working filter is below)
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
nrow(master50kplus) + nrow(master50kplusDot)
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)


## Question 3

##Cleaningmytreeadult

###categorical variables as factor
pwcClean$workclass <- as.factor(pwcClean$workclass)
pwcClean$`Income Group` <- as.factor(pwcClean$`Income Group`)
pwcClean$Education <- as.factor(pwcClean$Education)
pwcClean$`Occupation Status` <- as.factor(pwcClean$`Occupation Status`)
pwcClean$Relationship <- as.factor(pwcClean$Relationship)
pwcClean$Gender <- as.factor(pwcClean$Gender)
pwcClean$`Native country` <- as.factor(pwcClean$`Native country`)
summary(pwcClean)
pwcClean$`Marital Status` <- as.factor(pwcClean$`Marital Status`)
summary(pwcClean)
pwcClean$Age=(pwcClean$Age-min(pwcClean$Age))/(max(pwcClean$Age)-min(pwcClean$Age))

range(pwcClean$`Demographic Adjustment`)
hist(pwcClean$`Demographic Adjustment`)

###replacing missing values and merging Income Group (50k and 50k.)

install.packages("tidyr")
library("tidyr")
pwc$`Occupation Status` <- sub("?", "Other-service", pwc$`Occupation Status`, fixed = TRUE)
subset(pwc, Education %in% "Masters" & (`Income Group` %in% "50k" | `Income Group` %in% "50k."))
pwc$workclass[pwc$workclass==" ?"] = as.character(sample(pwc$workclass[which(pwc$workclass !=" ?")], 1774, replace = FALSE))
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50less <- subset (pwc, `Income Group` %in% "<=50K")



###normalization with Min-Max
pwcClean$`Demographic Adjustment`=(pwcClean$`Demographic Adjustment`-min(pwcClean$`Demographic Adjustment`))/(max(pwcClean$`Demographic Adjustment`)-min(pwcClean$`Demographic Adjustment`))
hist(pwcClean$`Demographic Adjustment`)
pwcClean$`Years of Education`=(pwcClean$`Years of Education`-min(pwcClean$`Years of Education`))/(max(pwcClean$`Years of Education`)-min(pwcClean$`Years of Education`))
pwcClean$`capital-gain`=(pwcClean$`capital-gain`-min(pwcClean$`capital-gain`))/(max(pwcClean$`capital-gain`)-min(pwcClean$`capital-gain`))
pwcClean$`capital-loss`=(pwcClean$`capital-loss`-min(pwcClean$`capital-loss`))/(max(pwcClean$`capital-loss`)-min(pwcClean$`capital-loss`))
pwcClean$`hours-per-week`=(pwcClean$`hours-per-week`-min(pwcClean$`hours-per-week`))/(max(pwcClean$`hours-per-week`)-min(pwcClean$`hours-per-week`))

###Normalized and cleaned data
pwcClean.normalized <- pwcClean
pwcClean.normalizedCopy <- pwcClean.normalized



## Decision Tree
##pruning the tree with rpart
install.packages("rpart")
library(rpart)
library(rpart.plot)
mytreeadult=rpart(`Income Group`~Education+workclass+`Occupation Status`+Gender+Age+`hours-per-week`
                  , data=pwcClean.normalized, method="class", control=rpart.control(minsplit=1))
mytreeadult
plot(mytreeadult)
rpart.plot(mytreeadult, type=3, extra=101, fallen.leaves = TRUE)
text(mytreeadult, use.n=T, all=T, pretty=0, cex=0.9, xpd=TRUE)
estincome.class=predict(mytreeadult, newdata=pwcClean.normalized, type="class")

##cross-validation table

t1=table(pwcClean.normalized$`Income Group`, estincome.class)
t1
## note: the first expression below is wrong because / binds tighter than + ; the corrected version follows
## (10061+1354)+(11564+1505)/30511
(10061+1354+11564+1505)/30511



###Training data 20000 out of 30511
index <- sample(1:nrow(pwcClean.normalized), 20000)
pwc.test = pwcClean.normalized[index,]
pwc.valid = pwcClean.normalized[-index,]

###validation data
## note: pwc.valid$est.income is only created by the nnet model below; build this table after running it
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc
(7457+1519)/(30511-20000)

## Neural network with a single hidden layer of 10 units (nnet size = 10)

require(nnet)
library(nnet)
set.seed(99999999)

pwc.net = nnet(`Income Group`~.,data=pwc.test,size=10)
pwc.valid$est.income = predict(pwc.net,pwc.valid,type="class")
Tpwc=table(pwc.valid$`Income Group`,pwc.valid$est.income)
Tpwc


## clear() is not a base R function; use rm(list = ls()) if you want to clear the workspace
## rpart() argument template for reference:
## rpart(formula, data=, method=, control=)
fit<-rpart(Default_On_Payment~Status_Checking_Acc+Duration_in_Months+Credit_History+Purposre_Credit_Taken+
             Credit_Amount+Savings_Acc+Years_At_Present_Employment+Inst_Rt_Income+Marital_Status_Gender+
             Other_Debtors_Guarantors+Current_Address_Yrs+Property+Age+Other_Inst_Plans+Housing+
             Num_CC+Job+Dependents+Telephone+Foreign_Worker,
           data=cust_data, method="class",
           control=rpart.control(minsplit=20, cp=0.01))











```
# Tweets Word Cloud
```R
##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
## library(sentimentanalysis)  # no CRAN package by this name; get_nrc_sentiment() below comes from syuzhet
library(curl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

## connect to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application
consumerKey="****"
consumerSecret="K****"
accesstoken="****"
accesssecret="****"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl'))
## A URL appears in the console; open it, get the PIN code, and enter it in the console.

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
## After the first run, you can start from this line in future sessions (libraries must already be loaded).

setup_twitter_oauth(consumerKey, consumerSecret, accesstoken, accesssecret )




##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = search_tweets("#rstudioconf", n=100000, include_rts =FALSE)

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

library(tm)  ## Corpus(), tm_map(), removeWords() and stopwords() come from the tm package
some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, color=pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")

```


To cite this code:

Loyer, Dominique. (2024). misc.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

organisateur 2.py

This Python script detects duplicate files, suggests renaming them for readability, then organizes and copies all files into a thematic folder structure using keyword matching and AI-based classification (Gemini).

Keywords: files, duplicates, organization, classification, AI

import os
import sys
import hashlib
import time
import json
import requests
import shutil  # Module pour copier les fichiers
from collections import defaultdict

# --- Fonctions de base (trouver les doublons) ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def find_duplicates(folder):
    """Trouve les fichiers en double."""
    print(f"\nÉtape 1: Recherche des doublons dans : {folder}")
    hashes = defaultdict(list)
    total_files = sum(len(files) for _, _, files in os.walk(folder))
    scanned_files = 0

    if total_files == 0:
        print("\nDossier vide ou inaccessible.")
        return {}

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            progress = (scanned_files / total_files) * 100
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nRecherche des doublons terminée.\n")
    return {h: p for h, p in hashes.items() if len(p) > 1}

# --- Fonctions de renommage et suggestion de suppression ---

def get_readability_score(filename):
    """Attribue un score à un nom de fichier."""
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def get_best_filename(filepaths):
    """Choisit le meilleur nom de fichier."""
    return os.path.basename(max(filepaths, key=lambda p: (get_readability_score(os.path.basename(p)), len(p))))

def process_duplicates_and_suggest_deletions(duplicates):
    """Effectue le renommage intelligent et liste les doublons à supprimer."""
    if not duplicates:
        print("\nBonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return []

    print(f"Étape 2: Traitement de {len(duplicates)} groupes de doublons (Renommage intelligent)")
    
    files_to_delete = []
    files_kept = []

    for group_paths in duplicates.values():
        group_paths.sort()
        original_to_keep = group_paths[0]
        original_dir = os.path.dirname(original_to_keep)
        
        best_name = get_best_filename(group_paths)
        final_path = os.path.join(original_dir, best_name)

        if original_to_keep != final_path:
            try:
                if os.path.exists(final_path):
                    final_path = original_to_keep
                else:
                    os.rename(original_to_keep, final_path)
                    print(f"  - Renommé : '{os.path.basename(original_to_keep)}' -> '{best_name}'")
            except OSError as e:
                print(f"  - ERREUR de renommage : {e}. Le nom original est conservé.")
                final_path = original_to_keep
        
        files_kept.append(final_path)

        # Ajoute les autres fichiers à la liste de suppression
        for path_to_delete in group_paths:
            if path_to_delete != original_to_keep:
                files_to_delete.append(path_to_delete)

    print("\nNettoyage des noms terminé.")
    if files_to_delete:
        print("\nLes fichiers suivants sont des doublons et peuvent être supprimés manuellement :")
        for f in files_to_delete:
            print(f"  - {f}")
    
    return files_kept

# --- Fonctions de classement par IA (version améliorée) ---

def classify_file_custom(filepath, categories):
    """Utilise l'IA pour classer un fichier selon des catégories spécifiques UQAM."""
    try:
        filename = os.path.basename(filepath).lower()
        
        # Tentative de classification directe par mots-clés
        for category_name, keywords in categories.items():
            for keyword in keywords:
                if keyword in filename:
                    return category_name

        # Si aucune correspondance, on demande à l'IA
        prompt = (f"Le nom du fichier est '{filename}'. Basé sur les sigles de cours de l'UQAM, "
                  f"classe ce fichier dans l'une de ces catégories : {list(categories.keys())}. "
                  "Exemples: 'projet_dic9345.pdf' va dans 'UQAM - DIC9345 - TALN'. "
                  "'resume_these_v3.docx' va dans 'Thèse de doctorat'. "
                  "Réponds uniquement avec le nom de la catégorie. Si incertain, réponds 'À Classer Manuellement'.")

        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiKey = ""
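        # Added note: the API key is intentionally left blank; a valid Gemini API key must be
        # supplied here, otherwise the request below fails and the file falls back to "À Classer Manuellement".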
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
        
        response = requests.post(apiUrl, json=payload, timeout=15)
        response.raise_for_status()
        
        result = response.json()
        category = result.get('candidates')[0]['content']['parts'][0]['text'].strip()
        
        if category in categories:
            return category
        else:
            return "À Classer Manuellement"
            
    except Exception:
        return "À Classer Manuellement"

def classify_and_copy_files(base_folder, all_files):
    """Classe tous les fichiers et en fait une copie dans des dossiers thématiques."""
    print("\nÉtape 3: Classement thématique des fichiers (création de copies)")
    
    # NOUVELLES CATÉGORIES BASÉES SUR VOS IMAGES
    uqam_categories = {
        "UQAM - DIC9251 - Modélisation": ["dic-9251", "dic9251", "modélisation"],
        "UQAM - DIC9335 - Science du web": ["dic-9335", "dic9335", "science du web"],
        "UQAM - DIC9270 - Séminaires": ["dic-9270", "dic9270", "séminaire"],
        "UQAM - DIC9345 - TALN": ["dic-9345", "dic9345", "taln", "langage naturel"],
        "UQAM - DIC9401 - Examen Général": ["dic-9401", "dic9401", "examen général"],
        "UQAM - DIC9411 - Projet Recherche": ["dic-9411", "dic9411", "projet de recherche"],
        "UQAM - DIC9150 - Concepts Fondamentaux": ["dic-9150", "dic9150"],
        "UQAM - DIC9001 - Fondements & Tendances": ["dic-9001", "dic9001"],
        "UQAM - DIC9351 - Apprentissage Machine": ["dic-9351", "dic9351", "apprentissage machine", "machine learning"],
        "Thèse de doctorat": ["thèse", "these", "dic-9500", "dic9500"],
        "À Classer Manuellement": []
    }

    copies_folder = os.path.join(base_folder, "Classement (Copies)")
    os.makedirs(copies_folder, exist_ok=True)
    
    print(f"Les copies classées seront créées dans : {copies_folder}")
    
    total_files_to_classify = len(all_files)
    for i, filepath in enumerate(all_files):
        if not os.path.exists(filepath):
            continue

        progress = ((i + 1) / total_files_to_classify) * 100
        sys.stdout.write(f"\rClassement et copie: {progress:.1f}% - {os.path.basename(filepath)}")
        sys.stdout.flush()

        category = classify_file_custom(filepath, uqam_categories)
        
        target_dir = os.path.join(copies_folder, category)
        os.makedirs(target_dir, exist_ok=True)
        
        try:
            # COPIE DU FICHIER AU LIEU DE DÉPLACEMENT
            shutil.copy2(filepath, target_dir)
        except Exception as e:
            print(f"\nErreur lors de la copie de {filepath} : {e}")
            
    print("\n\nLe classement et la création des copies sont terminés !")

# --- Fonction principale ---

def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        folder_path = folder_path.replace('\\ ', ' ').strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates_found = find_duplicates(folder_path)
        process_duplicates_and_suggest_deletions(duplicates_found)

        print("\nPréparation pour l'étape de classement...")
        # Récupère tous les fichiers restants pour le classement
        all_files_in_folder = set()
        for dirpath, _, filenames in os.walk(folder_path):
            if "Classement (Copies)" in dirpath: # Ignore le dossier de destination
                continue
            for f in filenames:
                all_files_in_folder.add(os.path.join(dirpath, f))
        
        final_list_of_files = [f for f in all_files_in_folder if os.path.exists(f)]

        if final_list_of_files:
            classify_and_copy_files(folder_path, final_list_of_files)
        else:
            print("\nAucun fichier à classer.")

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")

if __name__ == "__main__":
    main()
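For quick reference, here is a minimal, self-contained sketch of the duplicate-detection idea the script relies on: files whose SHA-256 digests match are grouped together. It is illustrative only and scans the current directory as a stand-in for a real target folder.

```python
# Minimal sketch of the duplicate-detection idea (illustrative, not the full script).
import hashlib
import os
from collections import defaultdict

def sha256_of(path, block=65536):
    """Hash a file in fixed-size blocks so large files are not loaded into memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(block), b""):
            h.update(chunk)
    return h.hexdigest()

groups = defaultdict(list)
for root, _, names in os.walk("."):          # "." is a placeholder target folder
    for name in names:
        p = os.path.join(root, name)
        try:
            if os.path.isfile(p) and os.path.getsize(p) > 0:
                groups[sha256_of(p)].append(p)
        except OSError:
            continue                         # skip unreadable files

duplicates = {h: ps for h, ps in groups.items() if len(ps) > 1}
print(f"{len(duplicates)} duplicate group(s) found")
```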

To cite this code:

Loyer, Dominique. (2024). organisateur 2.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

organisateur.py

This Python code cleans a folder by detecting duplicate files, suggesting which extra copies to delete after an intelligent renaming pass, and then organizing the remaining files into thematic categories via AI-assisted classification.

Keywords: duplicates, renaming, classification, AI, file organization

import os
import sys
import hashlib
import time
import json
import requests
import shutil  # Module pour copier les fichiers
from collections import defaultdict

# --- Fonctions de base (trouver les doublons) ---

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def find_duplicates(folder):
    """Trouve les fichiers en double."""
    print(f"\nÉtape 1: Recherche des doublons dans : {folder}")
    hashes = defaultdict(list)
    total_files = sum(len(files) for _, _, files in os.walk(folder))
    scanned_files = 0

    if total_files == 0:
        print("\nDossier vide ou inaccessible.")
        return {}

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            scanned_files += 1
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
            
            progress = (scanned_files / total_files) * 100
            sys.stdout.write(f"\rProgression: {progress:.1f}% ({scanned_files}/{total_files} fichiers)")
            sys.stdout.flush()

    print("\nRecherche des doublons terminée.\n")
    return {h: p for h, p in hashes.items() if len(p) > 1}

# --- Fonctions de renommage et suggestion de suppression ---

def get_readability_score(filename):
    """Attribue un score à un nom de fichier."""
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def get_best_filename(filepaths):
    """Choisit le meilleur nom de fichier."""
    return os.path.basename(max(filepaths, key=lambda p: (get_readability_score(os.path.basename(p)), len(p))))

def process_duplicates_and_suggest_deletions(duplicates):
    """Effectue le renommage intelligent et liste les doublons à supprimer."""
    if not duplicates:
        print("\nBonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return []

    print(f"Étape 2: Traitement de {len(duplicates)} groupes de doublons (Renommage intelligent)")
    
    files_to_delete = []
    files_kept = []

    for group_paths in duplicates.values():
        group_paths.sort()
        original_to_keep = group_paths[0]
        original_dir = os.path.dirname(original_to_keep)
        
        best_name = get_best_filename(group_paths)
        final_path = os.path.join(original_dir, best_name)

        if original_to_keep != final_path:
            try:
                if os.path.exists(final_path):
                    final_path = original_to_keep
                else:
                    os.rename(original_to_keep, final_path)
                    print(f"  - Renommé : '{os.path.basename(original_to_keep)}' -> '{best_name}'")
            except OSError as e:
                print(f"  - ERREUR de renommage : {e}. Le nom original est conservé.")
                final_path = original_to_keep
        
        files_kept.append(final_path)

        # Ajoute les autres fichiers à la liste de suppression
        for path_to_delete in group_paths:
            if path_to_delete != original_to_keep:
                files_to_delete.append(path_to_delete)

    print("\nNettoyage des noms terminé.")
    if files_to_delete:
        print("\nLes fichiers suivants sont des doublons et peuvent être supprimés manuellement :")
        for f in files_to_delete:
            print(f"  - {f}")
    
    return files_kept

# --- Fonctions de classement par IA (version améliorée) ---

def classify_file_custom(filepath, categories):
    """Utilise l'IA pour classer un fichier selon des catégories spécifiques UQAM."""
    try:
        filename = os.path.basename(filepath).lower()
        
        # Tentative de classification directe par mots-clés
        for category_name, keywords in categories.items():
            for keyword in keywords:
                if keyword in filename:
                    return category_name

        # Si aucune correspondance, on demande à l'IA
        prompt = (f"Le nom du fichier est '{filename}'. Basé sur les sigles de cours de l'UQAM, "
                  f"classe ce fichier dans l'une de ces catégories : {list(categories.keys())}. "
                  "Exemples: 'projet_dic9345.pdf' va dans 'UQAM - DIC9345 - TALN'. "
                  "'resume_these_v3.docx' va dans 'Thèse de doctorat'. "
                  "Réponds uniquement avec le nom de la catégorie. Si incertain, réponds 'À Classer Manuellement'.")

        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiKey = ""
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
        
        response = requests.post(apiUrl, json=payload, timeout=15)
        response.raise_for_status()
        
        result = response.json()
        category = result.get('candidates')[0]['content']['parts'][0]['text'].strip()
        
        if category in categories:
            return category
        else:
            return "À Classer Manuellement"
            
    except Exception:
        return "À Classer Manuellement"

def classify_and_copy_files(base_folder, all_files):
    """Classe tous les fichiers et en fait une copie dans des dossiers thématiques."""
    print("\nÉtape 3: Classement thématique des fichiers (création de copies)")
    
    # NOUVELLES CATÉGORIES BASÉES SUR VOS IMAGES
    uqam_categories = {
        "UQAM - DIC9251 - Modélisation": ["dic-9251", "dic9251", "modélisation"],
        "UQAM - DIC9335 - Science du web": ["dic-9335", "dic9335", "science du web"],
        "UQAM - DIC9270 - Séminaires": ["dic-9270", "dic9270", "séminaire"],
        "UQAM - DIC9345 - TALN": ["dic-9345", "dic9345", "taln", "langage naturel"],
        "UQAM - DIC9401 - Examen Général": ["dic-9401", "dic9401", "examen général"],
        "UQAM - DIC9411 - Projet Recherche": ["dic-9411", "dic9411", "projet de recherche"],
        "UQAM - DIC9150 - Concepts Fondamentaux": ["dic-9150", "dic9150"],
        "UQAM - DIC9001 - Fondements & Tendances": ["dic-9001", "dic9001"],
        "UQAM - DIC9351 - Apprentissage Machine": ["dic-9351", "dic9351", "apprentissage machine", "machine learning"],
        "Thèse de doctorat": ["thèse", "these", "dic-9500", "dic9500"],
        "À Classer Manuellement": []
    }

    copies_folder = os.path.join(base_folder, "Classement (Copies)")
    os.makedirs(copies_folder, exist_ok=True)
    
    print(f"Les copies classées seront créées dans : {copies_folder}")
    
    total_files_to_classify = len(all_files)
    for i, filepath in enumerate(all_files):
        if not os.path.exists(filepath):
            continue

        progress = ((i + 1) / total_files_to_classify) * 100
        sys.stdout.write(f"\rClassement et copie: {progress:.1f}% - {os.path.basename(filepath)}")
        sys.stdout.flush()

        category = classify_file_custom(filepath, uqam_categories)
        
        target_dir = os.path.join(copies_folder, category)
        os.makedirs(target_dir, exist_ok=True)
        
        try:
            # COPIE DU FICHIER AU LIEU DE DÉPLACEMENT
            shutil.copy2(filepath, target_dir)
        except Exception as e:
            print(f"\nErreur lors de la copie de {filepath} : {e}")
            
    print("\n\nLe classement et la création des copies sont terminés !")

# --- Fonction principale ---

def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip()
        folder_path = folder_path.replace('\\ ', ' ').strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates_found = find_duplicates(folder_path)
        process_duplicates_and_suggest_deletions(duplicates_found)

        print("\nPréparation pour l'étape de classement...")
        # Récupère tous les fichiers restants pour le classement
        all_files_in_folder = set()
        for dirpath, _, filenames in os.walk(folder_path):
            if "Classement (Copies)" in dirpath: # Ignore le dossier de destination
                continue
            for f in filenames:
                all_files_in_folder.add(os.path.join(dirpath, f))
        
        final_list_of_files = [f for f in all_files_in_folder if os.path.exists(f)]

        if final_list_of_files:
            classify_and_copy_files(folder_path, final_list_of_files)
        else:
            print("\nAucun fichier à classer.")

    except (EOFError, KeyboardInterrupt):
        print("\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). organisateur.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

organiser_1106.py

This Python script searches a folder for duplicate files using SHA-256 hashes (skipping the "Doublons" and "Classement_Final" folders), keeps the most recently modified or most readable copy of each group with a protection rule for files on the Desktop, and moves the remaining duplicates into a dedicated "Doublons" folder.

Keywords: duplicates, hashing, file cleanup, Desktop protection
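Among identical files, the script keeps the most recently modified copy (with an extra protection for files on the Desktop). A minimal sketch of that selection rule, using throw-away temporary files so it runs anywhere:

```python
# Illustrative sketch of the "keep the newest copy" rule (temporary files only).
import os
import tempfile
import time

d = tempfile.mkdtemp()
older = os.path.join(d, "rapport (1).pdf")
open(older, "w").close()
time.sleep(0.1)                      # ensure distinct modification times
newer = os.path.join(d, "rapport.pdf")
open(newer, "w").close()

file_to_keep = max([older, newer], key=os.path.getmtime)
print("kept:", file_to_keep)         # the most recently modified file wins
```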

import os
import sys
import hashlib
import shutil
import json
import requests
from collections import defaultdict
import time

# =============================================================================
# --- ÉTAPE 1: RECHERCHE DE DOUBLONS ---
# =============================================================================

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except (IOError, FileNotFoundError):
        return None

def find_duplicates(folder):
    """Scanne un dossier pour trouver les fichiers avec un contenu identique."""
    print(f"\n--- ÉTAPE 1: Recherche des doublons dans : {folder} ---")
    hashes = defaultdict(list)
    excluded_dirs = ["Doublons", "Classement_Final"]
    files_to_scan = []
    for dirpath, dirnames, filenames in os.walk(folder):
        dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
        for filename in filenames:
            files_to_scan.append(os.path.join(dirpath, filename))

    total_files = len(files_to_scan)
    if total_files == 0:
        return {}, []

    for i, filepath in enumerate(files_to_scan):
        try:
            progress = ((i + 1) / total_files) * 100
            sys.stdout.write(f"\rAnalyse des fichiers : {progress:.1f}% ({i+1}/{total_files})")
            sys.stdout.flush()
            if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
                file_hash = calculate_hash(filepath)
                if file_hash:
                    hashes[file_hash].append(filepath)
        except (FileNotFoundError, OSError):
            continue

    print("\nAnalyse des doublons terminée.")
    duplicates = {h: paths for h, paths in hashes.items() if len(paths) > 1}
    unique_files = [paths[0] for h, paths in hashes.items() if len(paths) == 1]
    return duplicates, unique_files

# =============================================================================
# --- ÉTAPE 2: TRAITEMENT AUTOMATIQUE ET PROTÉGÉ DES DOUBLONS ---
# =============================================================================

def get_readability_score(filename):
    """Attribue un score de 'lisibilité' à un nom de fichier."""
    score = len(filename)
    if any(c.isalpha() for c in filename): score += 20
    if filename.split('.')[0].isdigit(): score -= 10
    return score

def process_duplicates_safely(duplicates, base_folder, desktop_path):
    """
    Traite les doublons avec une règle de protection pour le Bureau uniquement.
    """
    if not duplicates:
        print("\n--- ÉTAPE 2: Traitement des doublons ---\nBonne nouvelle ! Aucun fichier en double à traiter.")
        return []

    print(f"\n--- ÉTAPE 2: Traitement automatique et protégé de {len(duplicates)} groupes de doublons ---")
    
    duplicates_folder = os.path.join(base_folder, "Doublons")
    os.makedirs(duplicates_folder, exist_ok=True)
    print(f"Les doublons non-protégés seront déplacés dans : {duplicates_folder}")

    files_kept = []
    for group_paths in duplicates.values():
        try:
            file_to_keep = None
            
            if desktop_path and any(p.startswith(desktop_path) for p in group_paths):
                desktop_files = [p for p in group_paths if p.startswith(desktop_path)]
                file_to_keep = max(desktop_files, key=os.path.getmtime)
                final_path = file_to_keep
                print(f"\n  -> Règle de priorité: Fichier du Bureau conservé tel quel : {os.path.basename(file_to_keep)}")
            else:
                file_to_keep = max(group_paths, key=os.path.getmtime)
                best_name = os.path.basename(max(group_paths, key=lambda p: get_readability_score(os.path.basename(p))))
                original_dir = os.path.dirname(file_to_keep)
                potential_new_path = os.path.join(original_dir, best_name)
                final_path = file_to_keep

                if file_to_keep != potential_new_path:
                    if not os.path.exists(potential_new_path):
                        os.rename(file_to_keep, potential_new_path)
                        final_path = potential_new_path
            
            files_kept.append(final_path)

            for path in group_paths:
                if path != file_to_keep:
                    if desktop_path and path.startswith(desktop_path):
                        print(f"  -> AVERTISSEMENT: Le doublon '{os.path.basename(path)}' est sur le Bureau et ne sera pas déplacé.")
                        continue
                    try:
                        shutil.move(path, os.path.join(duplicates_folder, os.path.basename(path)))
                    except Exception as e:
                        print(f"Erreur lors du déplacement du doublon {os.path.basename(path)}: {e}")

        except Exception as e:
            print(f"\nErreur lors du traitement d'un groupe de doublons : {e}")
            continue
            
    print("\nTraitement des doublons terminé.")
    return files_kept

# =============================================================================
# --- ÉTAPE 3: CLASSIFICATION IA (AVEC NOUVELLES TENTATIVES) ---
# =============================================================================

def learn_structure_from_path(path_to_learn):
    learned_categories = set()
    if not os.path.isdir(path_to_learn): return learned_categories
    print(f"Analyse de la structure de : {path_to_learn}...")
    for root, dirs, _ in os.walk(path_to_learn):
        for dir_name in dirs:
            if dir_name.startswith('.') or dir_name.lower() in ["attachments", "files", "images"]: continue
            clean_name = dir_name.replace('_', ' ').strip()
            if len(clean_name) > 3: learned_categories.add(clean_name)
    return learned_categories

def propose_categories_with_ai(files, learned_categories, api_key):
    """Propose des catégories pour une liste de fichiers en utilisant l'IA, par lots et avec nouvelles tentatives."""
    print(f"\n--- ÉTAPE 3: Proposition des catégories par IA pour {len(files)} fichiers ---")
    
    BATCH_SIZE = 75  # Taille de lot légèrement réduite
    MAX_RETRIES = 3 # Nombre maximum de nouvelles tentatives par lot
    BACKOFF_FACTOR = 2 # Multiplicateur pour le temps d'attente
    
    all_ai_classifications = {}
    total_files = len(files)
    
    for i in range(0, total_files, BATCH_SIZE):
        batch_files = files[i:i + BATCH_SIZE]
        batch_filenames = [os.path.basename(f) for f in batch_files]
        
        current_batch_num = (i // BATCH_SIZE) + 1
        total_batches = (total_files + BATCH_SIZE - 1) // BATCH_SIZE
        print(f"\nTraitement du lot {current_batch_num}/{total_batches}...")

        prompt = (f"En te basant sur cette liste de catégories apprises : {list(learned_categories)}. "
                  f"Pour la liste de noms de fichiers suivante : {json.dumps(batch_filenames)}. "
                  "Propose la catégorie la plus pertinente pour CHAQUE fichier. "
                  "Si aucune catégorie ne correspond, invente une catégorie pertinente et concise (2-4 mots). "
                  "Réponds UNIQUEMENT avec un objet JSON où chaque clé est un nom de fichier et sa valeur est la catégorie proposée.")
        
        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
        
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(apiUrl, json=payload, timeout=90)
                response.raise_for_status() # Lève une erreur pour les codes 4xx/5xx
                
                json_text = response.json()['candidates'][0]['content']['parts'][0]['text']
                json_text = json_text.strip().replace("```json", "").replace("```", "")
                
                batch_classifications = json.loads(json_text)
                all_ai_classifications.update(batch_classifications)
                print(f"Lot {current_batch_num} traité avec succès.")
                break # Sortir de la boucle de nouvelles tentatives si réussi

            except requests.exceptions.HTTPError as err:
                if err.response.status_code in [500, 503, 504] and (attempt + 1) < MAX_RETRIES:
                    wait_time = BACKOFF_FACTOR * (2 ** attempt)
                    print(f"  -> Échec temporaire du lot {current_batch_num} (tentative {attempt + 1}/{MAX_RETRIES}): {err.response.status_code}. Nouvelle tentative dans {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"\n[ERREUR HTTP] Échec final du lot {current_batch_num}: {err.response.status_code} {err.response.reason}")
                    print("Ce lot sera ignoré. Le script continuera avec les lots suivants.")
                    break
            except Exception as e:
                print(f"\n[ERREUR] Échec du lot {current_batch_num} : {e}")
                print("Ce lot sera ignoré.")
                break
        
        # Petite pause pour ne pas surcharger l'API entre les lots réussis
        time.sleep(1)

    # Re-mapper les noms de fichiers aux chemins complets
    final_plan = {f: all_ai_classifications.get(os.path.basename(f), "Non Classé") for f in files}
    print("\nProposition des catégories par IA terminée.")
    return final_plan


# =============================================================================
# --- ÉTAPE 4: VALIDATION DES CATÉGORIES ---
# =============================================================================

def validate_categories(classification_plan):
    if not classification_plan: return set()
    proposed_categories = sorted(list(set(classification_plan.values())))
    print("\n--- ÉTAPE 4: Validation des catégories proposées ---")
    print("Veuillez choisir les catégories que vous souhaitez créer comme dossiers.")
    for i, cat_name in enumerate(proposed_categories): print(f"  {i+1}) {cat_name}")
    validated_categories = set()
    while True:
        try:
            choices_str = input("\nEntrez les numéros (ex: 1,3,4), ou 'toutes' : ")
            if choices_str.lower() == 'toutes': return set(proposed_categories)
            chosen_indices = [int(i.strip()) - 1 for i in choices_str.split(',')]
            for index in chosen_indices:
                if 0 <= index < len(proposed_categories): validated_categories.add(proposed_categories[index])
            if validated_categories:
                print("\nCatégories validées :", ", ".join(validated_categories))
                return validated_categories
        except ValueError:
            print("[ERREUR] Entrée invalide.")

# =============================================================================
# --- ÉTAPE 5: ORGANISATION FINALE ---
# =============================================================================

def execute_final_organization(base_folder, classification_plan, validated_categories, desktop_path):
    """Crée les dossiers et déplace les fichiers, EN PROTÉGEANT LE BUREAU."""
    print("\n--- ÉTAPE 5: Organisation finale des fichiers ---")
    output_root = os.path.join(base_folder, "Classement_Final")
    os.makedirs(output_root, exist_ok=True)
    print(f"Les fichiers seront déplacés et organisés dans : {output_root}")

    for cat in validated_categories: os.makedirs(os.path.join(output_root, cat), exist_ok=True)
    unclassified_dir = os.path.join(output_root, "Non Classé")
    os.makedirs(unclassified_dir, exist_ok=True)

    for i, (original_path, category) in enumerate(classification_plan.items()):
        sys.stdout.write(f"\rDéplacement des fichiers : {((i + 1) / len(classification_plan)) * 100:.1f}%")
        sys.stdout.flush()

        if not os.path.exists(original_path): continue
        
        if desktop_path and original_path.startswith(desktop_path):
            continue

        target_dir = os.path.join(output_root, category) if category in validated_categories else unclassified_dir
        try:
            shutil.move(original_path, os.path.join(target_dir, os.path.basename(original_path)))
        except Exception as e:
            print(f"\n[ERREUR] Impossible de déplacer {original_path}. Erreur: {e}")
            
    print("\n\nL'organisation finale est terminée !")
    print(f"Vos fichiers sont maintenant organisés dans le dossier : {output_root}")
    print(f"Note : les fichiers sur votre Bureau n'ont pas été déplacés.")

# =============================================================================
# --- FONCTION PRINCIPALE (main) ---
# =============================================================================

def main():
    try:
        home = os.path.expanduser('~')
        
        desktop_path = ""
        potential_desktop_paths = [os.path.join(home, 'Desktop'), os.path.join(home, 'Bureau')]
        for path in potential_desktop_paths:
            if os.path.isdir(path):
                desktop_path = path
                break
        
        print("--- Assistant d'Organisation v13 (avec Nouvelles Tentatives) ---")
        if desktop_path: print(f"Règle de protection activée pour le Bureau : {desktop_path}")
        
        api_key = input("Veuillez coller votre clé API Google AI Studio ici : ").strip()
        if not api_key:
            print("Aucune clé API fournie. La classification par IA sera désactivée.")
        
        obsidian_path = input("1. (Optionnel) Glissez-déposez votre coffre Obsidian pour apprendre : ").strip().strip("'\"")
        notion_path = input("2. (Optionnel) Glissez-déposez votre export Notion pour apprendre : ").strip().strip("'\"")
        
        learned_categories = learn_structure_from_path(obsidian_path)
        learned_categories.update(learn_structure_from_path(notion_path))
        if learned_categories: print(f"\nApprentissage terminé. {len(learned_categories)} catégories potentielles apprises.")

        folder_path = input("\n3. Maintenant, glissez-déposez le dossier principal à analyser : ").strip().strip("'\"")
        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        duplicates, unique_files = find_duplicates(folder_path)
        files_kept_after_dedup = process_duplicates_safely(duplicates, folder_path, desktop_path)
        
        all_files_to_classify = list(set(unique_files + files_kept_after_dedup))
        all_files_to_classify = [f for f in all_files_to_classify if os.path.exists(f)]

        if not all_files_to_classify:
            print("\nIl ne reste aucun fichier à classer. Opération terminée.")
            return

        classification_plan = {}
        if api_key:
            classification_plan = propose_categories_with_ai(all_files_to_classify, learned_categories, api_key)
        else:
            classification_plan = {f: "Non Classé" for f in all_files_to_classify}

        validated_categories = validate_categories(classification_plan)

        if validated_categories:
            confirm = input("\nPrêt à lancer l'organisation finale ? (o/n) > ").lower()
            if confirm == 'o':
                execute_final_organization(folder_path, classification_plan, validated_categories, desktop_path)
            else:
                print("\nOpération finale annulée.")
        else:
            print("\nAucune catégorie validée. Opération annulée.")

    except (EOFError, KeyboardInterrupt):
        print("\n\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). organiser_1106.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

organizer 3.py

This Python script organizes a folder in five steps: it finds duplicates by SHA-256 hash, interactively chooses which copy of each group to keep (files on the Desktop are protected and keep their original names), classifies files locally by keyword matching against UQAM course categories (DIC9001, DIC9345, ...), sends the remaining file names to the Gemini API for batch classification, asks the user to resolve the uncertain cases, and then copies everything into a Classement_Final folder.

Keywords: deduplication, keyword classification, Gemini API, manual validation, file copying
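
Before any API call, the script runs a purely local pass that matches keywords from a category dictionary against each file name. A minimal, self-contained sketch of that step (only a subset of the categories is shown; classify_by_keywords is an illustrative name):

CATEGORIES = {
    "DIC9345": ["dic-9345", "dic9345", "taln", "langage naturel"],
    "UQAM": ["uqam"],
}
MANUAL = "À Classer Manuellement"

def classify_by_keywords(filename, categories=CATEGORIES, fallback=MANUAL):
    """Return the first category whose keyword appears in the lowercased file name."""
    name = filename.lower()
    for category, keywords in categories.items():
        if any(keyword in name for keyword in keywords):
            return category
    return fallback

print(classify_by_keywords("rapport_DIC9345.pdf"))   # -> DIC9345
print(classify_by_keywords("photo_vacances.jpg"))    # -> À Classer Manuellement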

import os
import sys
import hashlib
import shutil
import json
import requests
from collections import defaultdict
import time

# =============================================================================
# --- ÉTAPE 1: RECHERCHE DE DOUBLONS ---
# =============================================================================

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier pour identifier les doublons."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError as e:
        print(f"\n[AVERTISSEMENT] Impossible de lire le fichier : {filepath}. Erreur : {e}")
        return None

def find_duplicates(folder, desktop_path):
    """
    Scanne un dossier pour trouver les fichiers avec un contenu identique.
    Retourne un dictionnaire où les clés sont les hash et les valeurs sont les listes de chemins de fichiers.
    """
    print(f"\n--- ÉTAPE 1: Recherche des doublons dans le dossier : {folder} ---")
    hashes = defaultdict(list)
    files_to_scan = []
    # On ne parcourt l'arborescence qu'une seule fois pour la performance
    for dirpath, _, filenames in os.walk(folder):
        # On s'assure de ne pas scanner le futur dossier de sortie
        if "Classement_Final" in dirpath:
            continue
        # On ne scanne pas le bureau s'il n'est pas le dossier de départ
        if desktop_path and desktop_path in dirpath and desktop_path != folder:
             continue
        for filename in filenames:
            files_to_scan.append(os.path.join(dirpath, filename))

    total_files = len(files_to_scan)
    if total_files == 0:
        print("\nLe dossier est vide ou inaccessible.")
        return {}, []

    for i, filepath in enumerate(files_to_scan):
        progress = ((i + 1) / total_files) * 100
        sys.stdout.write(f"\rAnalyse des fichiers : {progress:.1f}% ({i+1}/{total_files})")
        sys.stdout.flush()

        # On ignore les raccourcis et les fichiers vides
        if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
            file_hash = calculate_hash(filepath)
            if file_hash:
                hashes[file_hash].append(filepath)

    print("\nAnalyse des doublons terminée.")
    
    # Sépare les fichiers uniques des doublons
    duplicates = {h: paths for h, paths in hashes.items() if len(paths) > 1}
    unique_files = [paths[0] for h, paths in hashes.items() if len(paths) == 1]
    
    return duplicates, unique_files

# =============================================================================
# --- ÉTAPE 2: TRAITEMENT DES DOUBLONS (INTERACTIF ET PROTÉGÉ) ---
# =============================================================================

def get_readability_score(filename):
    """
    Attribue un score de "lisibilité" à un nom de fichier.
    Un score élevé signifie un nom plus descriptif.
    """
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def process_duplicates(duplicates, desktop_path):
    """
    Pour chaque groupe de doublons, choisit le fichier à garder selon les nouvelles règles.
    Retourne la liste des fichiers à garder et ceux à ignorer.
    """
    if not duplicates:
        print("\n--- ÉTAPE 2: Traitement des doublons ---\nBonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return [], []

    print(f"\n--- ÉTAPE 2: Traitement de {len(duplicates)} groupes de doublons (mode interactif) ---")
    
    files_to_keep = []
    files_to_delete = []

    for group_paths in duplicates.values():
        group_paths.sort()
        
        best_file_to_keep = None
        final_name = ""

        # RÈGLE 1: PRIORITÉ AU FICHIER DU BUREAU (DESKTOP)
        desktop_files = []
        if desktop_path: # Vérifie si un chemin de bureau a été trouvé
            desktop_files = [p for p in group_paths if p.startswith(desktop_path)]

        if desktop_files:
            # On conserve le fichier du bureau (le plus récent si plusieurs sur le bureau)
            best_file_to_keep = max(desktop_files, key=os.path.getmtime)
            final_name = os.path.basename(best_file_to_keep) # Le nom d'origine est conservé
            print(f"\n  Groupe de doublons :")
            print(f"    -> RÈGLE DE PRIORITÉ: Fichier du Bureau conservé : '{final_name}' (nom original protégé)")

        else:
            # RÈGLE 2: AUCUN FICHIER DU BUREAU -> ON GARDE LE PLUS RÉCENT
            best_file_to_keep = max(group_paths, key=os.path.getmtime)
            
            # RÈGLE 3: NOMMAGE INTERACTIF
            # Propose le nom le plus "lisible" du groupe comme suggestion
            suggested_name = os.path.basename(max(group_paths, key=lambda p: get_readability_score(os.path.basename(p))))
            
            print(f"\n  Groupe de doublons :")
            print(f"    -> Fichier à conserver (le plus récent) : '{os.path.basename(best_file_to_keep)}'")
            print(f"    -> Suggestion de nom : '{suggested_name}'")
            
            new_name_input = input("    -> Appuyez sur Entrée pour accepter la suggestion, ou entrez un nouveau nom (avec extension) : ").strip()
            
            if new_name_input:
                final_name = new_name_input
            else:
                final_name = suggested_name
            print(f"    -> Nom final retenu : '{final_name}'")

        # Ajoute le fichier choisi à la liste des fichiers à conserver
        files_to_keep.append({'original_path': best_file_to_keep, 'new_name': final_name})
        
        # Les autres fichiers du groupe sont marqués pour être ignorés
        for path in group_paths:
            if path != best_file_to_keep:
                files_to_delete.append(path)
                print(f"    -> Doublon à ignorer : '{os.path.basename(path)}'")

    print("\nTraitement des doublons terminé.")
    return files_to_keep, files_to_delete


# =============================================================================
# --- ÉTAPE 3: CLASSIFICATION AUTOMATIQUE (LOCALE + IA) ---
# =============================================================================

# Définition des catégories et des mots-clés associés
CATEGORIES = {
    "DIC9001": ["dic-9001", "dic9001", "fondements", "tendances"],
    "DIC9050": ["dic-9050", "dic9050"],
    "DIC9270": ["dic-9270", "dic9270", "séminaire"],
    "DIC9335": ["dic-9335", "dic9335", "science du web"],
    "DIC9345": ["dic-9345", "dic9345", "taln", "langage naturel"],
    "DIC9251": ["dic-9251", "dic9251", "modélisation"],
    "DIC9401": ["dic-9401", "dic9401", "examen général"],
    "UQAM": ["uqam"],
    "Autres": [], # Reste par défaut
}
MANUAL_CLASSIFICATION_KEY = "À Classer Manuellement"

def classify_files_locally(files_to_classify):
    """Tente de classer les fichiers en se basant sur des mots-clés dans leur nom."""
    classified = {}
    unclassified = []
    
    for file_info in files_to_classify:
        # Utilise le nouveau nom pour la classification
        filename = file_info['new_name'].lower()
        filepath = file_info['original_path']
        found = False
        for category, keywords in CATEGORIES.items():
            if any(keyword in filename for keyword in keywords):
                classified[filepath] = category
                found = True
                break
        if not found:
            unclassified.append(file_info)
            
    return classified, unclassified

def batch_classify_with_ai(files_to_classify, categories_list):
    """Envoie une liste de noms de fichiers à l'API Gemini pour une classification en lot."""
    print(f"\nDemande de classification à l'IA pour {len(files_to_classify)} fichiers restants...")
    
    # Utilise les noms finaux (choisis par l'utilisateur) pour la classification
    filenames = [f['new_name'] for f in files_to_classify]
    
    prompt = f"""
    Voici une liste de noms de fichiers : {json.dumps(filenames)}.
    Classe chaque nom de fichier dans l'une des catégories suivantes : {categories_list}.
    Si tu es incertain, utilise la catégorie '{MANUAL_CLASSIFICATION_KEY}'.
    Réponds UNIQUEMENT avec un objet JSON où chaque clé est un nom de fichier et sa valeur est la catégorie.
    Exemple: {{"rapport_dic9345.pdf": "DIC9345", "photo_vacances.jpg": "{MANUAL_CLASSIFICATION_KEY}"}}
    """

    payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
    apiKey = ""  # must be filled in with a Google AI Studio API key before running
    apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
    
    try:
        response = requests.post(apiUrl, json=payload, timeout=60)
        response.raise_for_status()
        
        json_text = response.json()['candidates'][0]['content']['parts'][0]['text']
        json_text = json_text.strip().replace("```json", "").replace("```", "")
        
        ai_classifications = json.loads(json_text)
        
        final_classifications = {}
        # Crée une map du nouveau nom vers le chemin original
        new_name_to_path_map = {f['new_name']: f['original_path'] for f in files_to_classify}

        for name, category in ai_classifications.items():
            if name in new_name_to_path_map:
                original_path = new_name_to_path_map[name]
                final_classifications[original_path] = category if category in categories_list else MANUAL_CLASSIFICATION_KEY
        
        print("Classification par IA terminée.")
        return final_classifications

    except requests.exceptions.RequestException as e:
        print(f"\n[ERREUR] Problème de connexion à l'API : {e}")
    except (json.JSONDecodeError, KeyError, IndexError) as e:
        print(f"\n[ERREUR] Impossible d'analyser la réponse de l'IA : {e}")
    
    return {f['original_path']: MANUAL_CLASSIFICATION_KEY for f in files_to_classify}

# =============================================================================
# --- ÉTAPE 4: VALIDATION MANUELLE ---
# =============================================================================

def manual_classification_step(classification_plan, path_to_name_map):
    """Permet à l'utilisateur de valider et de corriger la classification."""
    
    files_to_manually_classify = [
        path for path, cat in classification_plan.items() if cat == MANUAL_CLASSIFICATION_KEY
    ]
    
    if not files_to_manually_classify:
        print("\n--- ÉTAPE 4: Validation Manuelle ---\nExcellente nouvelle ! Tous les fichiers ont été classés automatiquement.")
        return classification_plan
        
    print(f"\n--- ÉTAPE 4: Validation Manuelle ({len(files_to_manually_classify)} fichiers à vérifier) ---")
    
    manual_categories = sorted(list(set(list(CATEGORIES.keys()) + ["UQAM", "Autres"])))
    
    for filepath in files_to_manually_classify:
        # Affiche le nom final choisi par l'utilisateur
        display_name = path_to_name_map.get(filepath, os.path.basename(filepath))
        print(f"\nFichier : {display_name}")
        
        for i, cat_name in enumerate(manual_categories):
            print(f"  {i+1}) {cat_name}")
        
        while True:
            try:
                choice = input("Dans quelle catégorie classer ce fichier ? (Entrez un numéro) > ")
                chosen_category = manual_categories[int(choice) - 1]
                classification_plan[filepath] = chosen_category
                print(f" -> Classé dans : {chosen_category}\n")
                break
            except (ValueError, IndexError):
                print("[ERREUR] Veuillez entrer un numéro valide de la liste.")

    print("Validation manuelle terminée.")
    return classification_plan

# =============================================================================
# --- ÉTAPE 5: EXÉCUTION DU CLASSEMENT ---
# =============================================================================

def execute_classification(base_folder, classification_plan, files_to_keep_info):
    """Crée les dossiers et copie les fichiers selon le plan final."""
    print("\n--- ÉTAPE 5: Création des dossiers de destination ---")
    
    print("\nOù souhaitez-vous créer le dossier 'Classement_Final' ?")
    print(f"1) Dans le dossier analysé ({base_folder})")
    print("2) Choisir un autre emplacement")
    
    destination_folder = ""
    while True:
        choice = input("Votre choix (1 ou 2) ? > ").strip()
        if choice == '1':
            destination_folder = base_folder
            break
        elif choice == '2':
            destination_folder = input("Veuillez glisser-déposer le dossier de destination ici : ").strip().strip("'\"")
            if os.path.isdir(destination_folder):
                break
            else:
                print("[ERREUR] Le chemin fourni n'est pas un dossier valide.")
        else:
            print("[ERREUR] Choix invalide.")
            
    output_root = os.path.join(destination_folder, "Classement_Final")
    os.makedirs(output_root, exist_ok=True)
    print(f"\nLes fichiers seront copiés et organisés dans : {output_root}")

    path_to_new_name_map = {info['original_path']: info['new_name'] for info in files_to_keep_info}
    
    total_files_to_copy = len(classification_plan)
    for i, (original_path, category) in enumerate(classification_plan.items()):
        progress = ((i + 1) / total_files_to_copy) * 100
        sys.stdout.write(f"\rCopie des fichiers : {progress:.1f}%")
        sys.stdout.flush()

        category_dir = os.path.join(output_root, category)
        os.makedirs(category_dir, exist_ok=True)
        
        final_filename = path_to_new_name_map.get(original_path, os.path.basename(original_path))
        destination_path = os.path.join(category_dir, final_filename)
        
        if os.path.exists(destination_path):
            name, ext = os.path.splitext(final_filename)
            destination_path = os.path.join(category_dir, f"{name}_copie_{int(time.time())}{ext}")

        try:
            shutil.copy2(original_path, destination_path)
        except Exception as e:
            print(f"\n[ERREUR] Impossible de copier {original_path} vers {destination_path}. Erreur: {e}")
            
    print("\n\nLe classement et la copie sont terminés avec succès !")
    print(f"Vos fichiers sont maintenant organisés dans le dossier : {output_root}")


# =============================================================================
# --- FONCTION PRINCIPALE (main) ---
# =============================================================================

def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        # Détecter le chemin du Bureau pour la protection des fichiers
        home = os.path.expanduser('~')
        desktop_path_fr = os.path.join(home, 'Bureau')
        desktop_path_en = os.path.join(home, 'Desktop')
        desktop_path = ""
        if os.path.isdir(desktop_path_fr):
            desktop_path = desktop_path_fr
        elif os.path.isdir(desktop_path_en):
            desktop_path = desktop_path_en
        
        if desktop_path:
            print(f"\nINFO: Protection activée pour les fichiers du Bureau ({desktop_path})")

        # Étape 1 & 2: Trouver et traiter les doublons de manière interactive
        duplicates, unique_files_paths = find_duplicates(folder_path, desktop_path)
        files_to_keep_from_duplicates, files_to_delete = process_duplicates(duplicates, desktop_path)
        
        # Confirmation avant de continuer si des doublons sont ignorés
        if files_to_delete:
            print("\n--- Confirmation requise ---")
            print(f"{len(files_to_delete)} fichiers doublons ont été identifiés et seront ignorés (non copiés).")
            confirm_ignore = input("Voulez-vous continuer le processus de classement ? (o/n) > ").lower().strip()
            if confirm_ignore != 'o':
                print("\nOpération annulée par l'utilisateur.")
                return

        # Liste complète de tous les fichiers à garder et à classer
        files_to_classify_info = files_to_keep_from_duplicates + \
            [{'original_path': p, 'new_name': os.path.basename(p)} for p in unique_files_paths]
        
        if not files_to_classify_info:
            print("\nAucun fichier à classer.")
            return

        print(f"\nTotal de {len(files_to_classify_info)} fichiers uniques à classer.")
        path_to_name_map = {f['original_path']: f['new_name'] for f in files_to_classify_info}

        # Étape 3: Classification automatique
        print("\n--- ÉTAPE 3: Lancement de la classification automatique ---")
        classified_locally, remaining_for_ai = classify_files_locally(files_to_classify_info)
        print(f"{len(classified_locally)} fichiers classés localement par mots-clés.")

        ai_classifications = {}
        if remaining_for_ai:
            categories_list_for_ai = sorted(list(CATEGORIES.keys())) + [MANUAL_CLASSIFICATION_KEY]
            ai_classifications = batch_classify_with_ai(remaining_for_ai, categories_list_for_ai)
        
        final_plan = {**classified_locally, **ai_classifications}
        
        # Étape 4: Validation manuelle par l'utilisateur
        final_plan = manual_classification_step(final_plan, path_to_name_map)
        
        # Résumé avant action finale
        print("\n--- Résumé du plan de classement final ---")
        summary = defaultdict(list)
        for path, cat in final_plan.items():
            summary[cat].append(path_to_name_map.get(path, os.path.basename(path)))
        
        for cat, files in summary.items():
            print(f"\nCatégorie '{cat}' ({len(files)} fichiers):")
            for f_name in files[:5]:
                print(f"  - {f_name}")
            if len(files) > 5:
                print(f"  - ... et {len(files) - 5} autres.")
        
        # Étape 5: Exécution
        confirm = input("\nÊtes-vous prêt à créer les dossiers et à copier les fichiers ? (o/n) > ").lower()
        if confirm == 'o':
            execute_classification(folder_path, final_plan, files_to_classify_info)
        else:
            print("\nOpération finale annulée.")

    except (EOFError, KeyboardInterrupt):
        print("\n\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). organizer 3.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

organizer.py

This Python script follows the same five-step pipeline as organizer 3.py: SHA-256 duplicate detection with Desktop protection, interactive choice and renaming of the copy to keep, local keyword classification into UQAM course categories, batch classification of the remaining files via the Gemini API, manual validation, and copying of the results into a Classement_Final folder.

Keywords: deduplication, SHA-256, keyword classification, Gemini API, manual validation
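
Duplicate detection here boils down to hashing each file's contents block by block and grouping paths that share a digest. A minimal standard-library sketch of that idea (group_by_content is an illustrative name, not a function from the script):

import hashlib
import os
from collections import defaultdict

def sha256_of(filepath, block_size=65536):
    """Hash a file in fixed-size blocks so large files never load fully into memory."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as f:
        for block in iter(lambda: f.read(block_size), b""):
            digest.update(block)
    return digest.hexdigest()

def group_by_content(folder):
    """Map content hash -> list of paths; any list longer than 1 is a duplicate group."""
    groups = defaultdict(list)
    for dirpath, _, filenames in os.walk(folder):
        for name in filenames:
            path = os.path.join(dirpath, name)
            if not os.path.islink(path) and os.path.getsize(path) > 0:
                groups[sha256_of(path)].append(path)
    return {h: paths for h, paths in groups.items() if len(paths) > 1}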

import os
import sys
import hashlib
import shutil
import json
import requests
from collections import defaultdict
import time

# =============================================================================
# --- ÉTAPE 1: RECHERCHE DE DOUBLONS ---
# =============================================================================

def calculate_hash(filepath, block_size=65536):
    """Calcule le hash SHA-256 d'un fichier pour identifier les doublons."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while True:
                data = f.read(block_size)
                if not data:
                    break
                sha256.update(data)
        return sha256.hexdigest()
    except IOError as e:
        print(f"\n[AVERTISSEMENT] Impossible de lire le fichier : {filepath}. Erreur : {e}")
        return None

def find_duplicates(folder, desktop_path):
    """
    Scanne un dossier pour trouver les fichiers avec un contenu identique.
    Retourne un dictionnaire où les clés sont les hash et les valeurs sont les listes de chemins de fichiers.
    """
    print(f"\n--- ÉTAPE 1: Recherche des doublons dans le dossier : {folder} ---")
    hashes = defaultdict(list)
    files_to_scan = []
    # On ne parcourt l'arborescence qu'une seule fois pour la performance
    for dirpath, _, filenames in os.walk(folder):
        # On s'assure de ne pas scanner le futur dossier de sortie
        if "Classement_Final" in dirpath:
            continue
        # On ne scanne pas le bureau s'il n'est pas le dossier de départ
        if desktop_path and desktop_path in dirpath and desktop_path != folder:
             continue
        for filename in filenames:
            files_to_scan.append(os.path.join(dirpath, filename))

    total_files = len(files_to_scan)
    if total_files == 0:
        print("\nLe dossier est vide ou inaccessible.")
        return {}, []

    for i, filepath in enumerate(files_to_scan):
        progress = ((i + 1) / total_files) * 100
        sys.stdout.write(f"\rAnalyse des fichiers : {progress:.1f}% ({i+1}/{total_files})")
        sys.stdout.flush()

        # On ignore les raccourcis et les fichiers vides
        if not os.path.islink(filepath) and os.path.getsize(filepath) > 0:
            file_hash = calculate_hash(filepath)
            if file_hash:
                hashes[file_hash].append(filepath)

    print("\nAnalyse des doublons terminée.")
    
    # Sépare les fichiers uniques des doublons
    duplicates = {h: paths for h, paths in hashes.items() if len(paths) > 1}
    unique_files = [paths[0] for h, paths in hashes.items() if len(paths) == 1]
    
    return duplicates, unique_files

# =============================================================================
# --- ÉTAPE 2: TRAITEMENT DES DOUBLONS (INTERACTIF ET PROTÉGÉ) ---
# =============================================================================

def get_readability_score(filename):
    """
    Attribue un score de "lisibilité" à un nom de fichier.
    Un score élevé signifie un nom plus descriptif.
    """
    score = len(filename)
    if any(c.isalpha() for c in filename):
        score += 20
    if filename.split('.')[0].isdigit():
        score -= 10
    return score

def process_duplicates(duplicates, desktop_path):
    """
    Pour chaque groupe de doublons, choisit le fichier à garder selon les nouvelles règles.
    Retourne la liste des fichiers à garder et ceux à ignorer.
    """
    if not duplicates:
        print("\n--- ÉTAPE 2: Traitement des doublons ---\nBonne nouvelle ! Aucun fichier en double n'a été trouvé.")
        return [], []

    print(f"\n--- ÉTAPE 2: Traitement de {len(duplicates)} groupes de doublons (mode interactif) ---")
    
    files_to_keep = []
    files_to_delete = []

    for group_paths in duplicates.values():
        group_paths.sort()
        
        best_file_to_keep = None
        final_name = ""

        # RÈGLE 1: PRIORITÉ AU FICHIER DU BUREAU (DESKTOP)
        desktop_files = []
        if desktop_path: # Vérifie si un chemin de bureau a été trouvé
            desktop_files = [p for p in group_paths if p.startswith(desktop_path)]

        if desktop_files:
            # On conserve le fichier du bureau (le plus récent si plusieurs sur le bureau)
            best_file_to_keep = max(desktop_files, key=os.path.getmtime)
            final_name = os.path.basename(best_file_to_keep) # Le nom d'origine est conservé
            print(f"\n  Groupe de doublons :")
            print(f"    -> RÈGLE DE PRIORITÉ: Fichier du Bureau conservé : '{final_name}' (nom original protégé)")

        else:
            # RÈGLE 2: AUCUN FICHIER DU BUREAU -> ON GARDE LE PLUS RÉCENT
            best_file_to_keep = max(group_paths, key=os.path.getmtime)
            
            # RÈGLE 3: NOMMAGE INTERACTIF
            # Propose le nom le plus "lisible" du groupe comme suggestion
            suggested_name = os.path.basename(max(group_paths, key=lambda p: get_readability_score(os.path.basename(p))))
            
            print(f"\n  Groupe de doublons :")
            print(f"    -> Fichier à conserver (le plus récent) : '{os.path.basename(best_file_to_keep)}'")
            print(f"    -> Suggestion de nom : '{suggested_name}'")
            
            new_name_input = input("    -> Appuyez sur Entrée pour accepter la suggestion, ou entrez un nouveau nom (avec extension) : ").strip()
            
            if new_name_input:
                final_name = new_name_input
            else:
                final_name = suggested_name
            print(f"    -> Nom final retenu : '{final_name}'")

        # Ajoute le fichier choisi à la liste des fichiers à conserver
        files_to_keep.append({'original_path': best_file_to_keep, 'new_name': final_name})
        
        # Les autres fichiers du groupe sont marqués pour être ignorés
        for path in group_paths:
            if path != best_file_to_keep:
                files_to_delete.append(path)
                print(f"    -> Doublon à ignorer : '{os.path.basename(path)}'")

    print("\nTraitement des doublons terminé.")
    return files_to_keep, files_to_delete


# =============================================================================
# --- ÉTAPE 3: CLASSIFICATION AUTOMATIQUE (LOCALE + IA) ---
# =============================================================================

# Définition des catégories et des mots-clés associés
CATEGORIES = {
    "DIC9001": ["dic-9001", "dic9001", "fondements", "tendances"],
    "DIC9050": ["dic-9050", "dic9050"],
    "DIC9270": ["dic-9270", "dic9270", "séminaire"],
    "DIC9335": ["dic-9335", "dic9335", "science du web"],
    "DIC9345": ["dic-9345", "dic9345", "taln", "langage naturel"],
    "DIC9251": ["dic-9251", "dic9251", "modélisation"],
    "DIC9401": ["dic-9401", "dic9401", "examen général"],
    "UQAM": ["uqam"],
    "Autres": [], # Reste par défaut
}
MANUAL_CLASSIFICATION_KEY = "À Classer Manuellement"

def classify_files_locally(files_to_classify):
    """Tente de classer les fichiers en se basant sur des mots-clés dans leur nom."""
    classified = {}
    unclassified = []
    
    for file_info in files_to_classify:
        # Utilise le nouveau nom pour la classification
        filename = file_info['new_name'].lower()
        filepath = file_info['original_path']
        found = False
        for category, keywords in CATEGORIES.items():
            if any(keyword in filename for keyword in keywords):
                classified[filepath] = category
                found = True
                break
        if not found:
            unclassified.append(file_info)
            
    return classified, unclassified

def batch_classify_with_ai(files_to_classify, categories_list):
    """Envoie une liste de noms de fichiers à l'API Gemini pour une classification en lot."""
    print(f"\nDemande de classification à l'IA pour {len(files_to_classify)} fichiers restants...")
    
    # Utilise les noms finaux (choisis par l'utilisateur) pour la classification
    filenames = [f['new_name'] for f in files_to_classify]
    
    prompt = f"""
    Voici une liste de noms de fichiers : {json.dumps(filenames)}.
    Classe chaque nom de fichier dans l'une des catégories suivantes : {categories_list}.
    Si tu es incertain, utilise la catégorie '{MANUAL_CLASSIFICATION_KEY}'.
    Réponds UNIQUEMENT avec un objet JSON où chaque clé est un nom de fichier et sa valeur est la catégorie.
    Exemple: {{"rapport_dic9345.pdf": "DIC9345", "photo_vacances.jpg": "{MANUAL_CLASSIFICATION_KEY}"}}
    """

    payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
    apiKey = ""  # must be filled in with a Google AI Studio API key before running
    apiUrl = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
    
    try:
        response = requests.post(apiUrl, json=payload, timeout=60)
        response.raise_for_status()
        
        json_text = response.json()['candidates'][0]['content']['parts'][0]['text']
        json_text = json_text.strip().replace("```json", "").replace("```", "")
        
        ai_classifications = json.loads(json_text)
        
        final_classifications = {}
        # Crée une map du nouveau nom vers le chemin original
        new_name_to_path_map = {f['new_name']: f['original_path'] for f in files_to_classify}

        for name, category in ai_classifications.items():
            if name in new_name_to_path_map:
                original_path = new_name_to_path_map[name]
                final_classifications[original_path] = category if category in categories_list else MANUAL_CLASSIFICATION_KEY
        
        print("Classification par IA terminée.")
        return final_classifications

    except requests.exceptions.RequestException as e:
        print(f"\n[ERREUR] Problème de connexion à l'API : {e}")
    except (json.JSONDecodeError, KeyError, IndexError) as e:
        print(f"\n[ERREUR] Impossible d'analyser la réponse de l'IA : {e}")
    
    return {f['original_path']: MANUAL_CLASSIFICATION_KEY for f in files_to_classify}

# =============================================================================
# --- ÉTAPE 4: VALIDATION MANUELLE ---
# =============================================================================

def manual_classification_step(classification_plan, path_to_name_map):
    """Permet à l'utilisateur de valider et de corriger la classification."""
    
    files_to_manually_classify = [
        path for path, cat in classification_plan.items() if cat == MANUAL_CLASSIFICATION_KEY
    ]
    
    if not files_to_manually_classify:
        print("\n--- ÉTAPE 4: Validation Manuelle ---\nExcellente nouvelle ! Tous les fichiers ont été classés automatiquement.")
        return classification_plan
        
    print(f"\n--- ÉTAPE 4: Validation Manuelle ({len(files_to_manually_classify)} fichiers à vérifier) ---")
    
    manual_categories = sorted(list(set(list(CATEGORIES.keys()) + ["UQAM", "Autres"])))
    
    for filepath in files_to_manually_classify:
        # Affiche le nom final choisi par l'utilisateur
        display_name = path_to_name_map.get(filepath, os.path.basename(filepath))
        print(f"\nFichier : {display_name}")
        
        for i, cat_name in enumerate(manual_categories):
            print(f"  {i+1}) {cat_name}")
        
        while True:
            try:
                choice = input("Dans quelle catégorie classer ce fichier ? (Entrez un numéro) > ")
                chosen_category = manual_categories[int(choice) - 1]
                classification_plan[filepath] = chosen_category
                print(f" -> Classé dans : {chosen_category}\n")
                break
            except (ValueError, IndexError):
                print("[ERREUR] Veuillez entrer un numéro valide de la liste.")

    print("Validation manuelle terminée.")
    return classification_plan

# =============================================================================
# --- ÉTAPE 5: EXÉCUTION DU CLASSEMENT ---
# =============================================================================

def execute_classification(base_folder, classification_plan, files_to_keep_info):
    """Crée les dossiers et copie les fichiers selon le plan final."""
    print("\n--- ÉTAPE 5: Création des dossiers de destination ---")
    
    print("\nOù souhaitez-vous créer le dossier 'Classement_Final' ?")
    print(f"1) Dans le dossier analysé ({base_folder})")
    print("2) Choisir un autre emplacement")
    
    destination_folder = ""
    while True:
        choice = input("Votre choix (1 ou 2) ? > ").strip()
        if choice == '1':
            destination_folder = base_folder
            break
        elif choice == '2':
            destination_folder = input("Veuillez glisser-déposer le dossier de destination ici : ").strip().strip("'\"")
            if os.path.isdir(destination_folder):
                break
            else:
                print("[ERREUR] Le chemin fourni n'est pas un dossier valide.")
        else:
            print("[ERREUR] Choix invalide.")
            
    output_root = os.path.join(destination_folder, "Classement_Final")
    os.makedirs(output_root, exist_ok=True)
    print(f"\nLes fichiers seront copiés et organisés dans : {output_root}")

    path_to_new_name_map = {info['original_path']: info['new_name'] for info in files_to_keep_info}
    
    total_files_to_copy = len(classification_plan)
    for i, (original_path, category) in enumerate(classification_plan.items()):
        progress = ((i + 1) / total_files_to_copy) * 100
        sys.stdout.write(f"\rCopie des fichiers : {progress:.1f}%")
        sys.stdout.flush()

        category_dir = os.path.join(output_root, category)
        os.makedirs(category_dir, exist_ok=True)
        
        final_filename = path_to_new_name_map.get(original_path, os.path.basename(original_path))
        destination_path = os.path.join(category_dir, final_filename)
        
        if os.path.exists(destination_path):
            name, ext = os.path.splitext(final_filename)
            destination_path = os.path.join(category_dir, f"{name}_copie_{int(time.time())}{ext}")

        try:
            shutil.copy2(original_path, destination_path)
        except Exception as e:
            print(f"\n[ERREUR] Impossible de copier {original_path} vers {destination_path}. Erreur: {e}")
            
    print("\n\nLe classement et la copie sont terminés avec succès !")
    print(f"Vos fichiers sont maintenant organisés dans le dossier : {output_root}")


# =============================================================================
# --- FONCTION PRINCIPALE (main) ---
# =============================================================================

def main():
    try:
        folder_path = input("Veuillez glisser-déposer le dossier à analyser ici, puis appuyez sur Entrée : ").strip().strip("'\"")

        if not os.path.isdir(folder_path):
            print("\nErreur : Le chemin fourni n'est pas un dossier valide.")
            return

        # Détecter le chemin du Bureau pour la protection des fichiers
        home = os.path.expanduser('~')
        desktop_path_fr = os.path.join(home, 'Bureau')
        desktop_path_en = os.path.join(home, 'Desktop')
        desktop_path = ""
        if os.path.isdir(desktop_path_fr):
            desktop_path = desktop_path_fr
        elif os.path.isdir(desktop_path_en):
            desktop_path = desktop_path_en
        
        if desktop_path:
            print(f"\nINFO: Protection activée pour les fichiers du Bureau ({desktop_path})")

        # Étape 1 & 2: Trouver et traiter les doublons de manière interactive
        duplicates, unique_files_paths = find_duplicates(folder_path, desktop_path)
        files_to_keep_from_duplicates, files_to_delete = process_duplicates(duplicates, desktop_path)
        
        # Confirmation avant de continuer si des doublons sont ignorés
        if files_to_delete:
            print("\n--- Confirmation requise ---")
            print(f"{len(files_to_delete)} fichiers doublons ont été identifiés et seront ignorés (non copiés).")
            confirm_ignore = input("Voulez-vous continuer le processus de classement ? (o/n) > ").lower().strip()
            if confirm_ignore != 'o':
                print("\nOpération annulée par l'utilisateur.")
                return

        # Liste complète de tous les fichiers à garder et à classer
        files_to_classify_info = files_to_keep_from_duplicates + \
            [{'original_path': p, 'new_name': os.path.basename(p)} for p in unique_files_paths]
        
        if not files_to_classify_info:
            print("\nAucun fichier à classer.")
            return

        print(f"\nTotal de {len(files_to_classify_info)} fichiers uniques à classer.")
        path_to_name_map = {f['original_path']: f['new_name'] for f in files_to_classify_info}

        # Étape 3: Classification automatique
        print("\n--- ÉTAPE 3: Lancement de la classification automatique ---")
        classified_locally, remaining_for_ai = classify_files_locally(files_to_classify_info)
        print(f"{len(classified_locally)} fichiers classés localement par mots-clés.")

        ai_classifications = {}
        if remaining_for_ai:
            categories_list_for_ai = sorted(list(CATEGORIES.keys())) + [MANUAL_CLASSIFICATION_KEY]
            ai_classifications = batch_classify_with_ai(remaining_for_ai, categories_list_for_ai)
        
        final_plan = {**classified_locally, **ai_classifications}
        
        # Étape 4: Validation manuelle par l'utilisateur
        final_plan = manual_classification_step(final_plan, path_to_name_map)
        
        # Résumé avant action finale
        print("\n--- Résumé du plan de classement final ---")
        summary = defaultdict(list)
        for path, cat in final_plan.items():
            summary[cat].append(path_to_name_map.get(path, os.path.basename(path)))
        
        for cat, files in summary.items():
            print(f"\nCatégorie '{cat}' ({len(files)} fichiers):")
            for f_name in files[:5]:
                print(f"  - {f_name}")
            if len(files) > 5:
                print(f"  - ... et {len(files) - 5} autres.")
        
        # Étape 5: Exécution
        confirm = input("\nÊtes-vous prêt à créer les dossiers et à copier les fichiers ? (o/n) > ").lower()
        if confirm == 'o':
            execute_classification(folder_path, final_plan, files_to_classify_info)
        else:
            print("\nOpération finale annulée.")

    except (EOFError, KeyboardInterrupt):
        print("\n\nProgramme interrompu par l'utilisateur.")
    except Exception as e:
        print(f"\nUne erreur inattendue est survenue : {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

To cite this code:

Loyer, Dominique. (2024). organizer.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

parallèlisation_projet1_taln_7_8avril2025.ipynb

This notebook configures a complete Pyserini information-retrieval environment (OpenJDK 21 set as default, build tools, pybind11, the latest Pyserini, NLTK resources, TREC AP paths on Google Drive) and defines a preprocess_text function that adds Porter stemming on top of tokenization, lowercasing, and stop-word removal.

Keywords: information retrieval, Pyserini, Porter stemming, NLTK, preprocessing
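
The notable change compared with the non-stemmed pipeline is that tokens are now reduced with NLTK's PorterStemmer. A minimal sketch of that kind of preprocessing, assuming the punkt/punkt_tab and stopwords resources can be downloaded (the notebook's own preprocess_text additionally recovers from missing-resource errors):

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource, quiet=True)

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess(text):
    """Lowercase, tokenize, keep alphabetic non-stop-word tokens, then apply Porter stemming."""
    tokens = word_tokenize(text.lower())
    return " ".join(stemmer.stem(t) for t in tokens if t.isalpha() and t not in stop_words)

print(preprocess("Information retrieval systems are evaluated on TREC collections."))
# e.g. "inform retriev system evalu trec collect"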

# === Cellule 0: Configuration Complète (avec Stemming) ===
# Installe Java 21, configure comme défaut, installe outils build,
# pybind11, dernière Pyserini, NLTK+ressources, définit chemins,
# FONCTION preprocess_text AVEC STEMMING, parse topics.

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète (avec Stemming) ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try: subprocess.run(install_java_cmd, shell=True, check=True, timeout=180); print("OpenJDK 21 installé.")
except Exception as e: print(f"ERREUR installation Java 21: {e}"); raise

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try: subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True); subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True); print("update-alternatives configuré.")
    except Exception as e: print(f"ERREUR config update-alternatives: {e}")
else: print(f"ATTENTION: Chemin Java 21 non trouvé: {java_path_21}.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]): print(f"ATTENTION: Chemin JAVA_HOME inexistant.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try: subprocess.run(install_build_cmd, shell=True, check=True, timeout=180); print("Outils de build installés.")
except Exception as e_build: print(f"ERREUR installation outils de build: {e_build}")

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q"
try: subprocess.run(install_pybind_cmd, shell=True, check=True, timeout=60); print("pybind11 installé.")
except Exception as e_pybind: print(f"ERREUR installation pybind11: {e_pybind}")

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try: result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600); print("Paquets Python principaux installés.")
except Exception as e_pip: print(f"ERREUR installation pip: {e_pip}"); raise

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab'] # Liste corrigée
for resource in nltk_resources:
    try:
        if resource == 'punkt' or resource == 'punkt_tab': resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4': resource_path = f'corpora/{resource}.zip'
        elif resource == 'wordnet': resource_path = f'corpora/{resource}.zip' # Garder wordnet pour l'instant, même si non utilisé par stemmer
        else: resource_path = f'corpora/{resource}.zip'
        nltk.data.find(resource_path)
    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try: nltk.download(resource, quiet=True); print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download: print(f"  ERREUR téléchargement '{resource}': {e_download}")
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")
# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

if 'google.colab' in sys.modules:
    try: from google.colab import drive; drive.mount('/content/drive', force_remount=True); print("  Google Drive monté.")
    except Exception as e_mount: print(f"ATTENTION: Erreur montage Drive: {e_mount}")
if not os.path.exists(DRIVE_PROJECT_PATH): raise FileNotFoundError(f"Chemin Drive '{DRIVE_PROJECT_PATH}' inexistant.")
print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar"
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Sera recréé avec stemming
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True); os.makedirs(INDEX_DIR_BASELINE, exist_ok=True); os.makedirs(INDEX_DIR_PREPROC, exist_ok=True);
os.makedirs(CORPUS_DIR, exist_ok=True); os.makedirs(RUN_DIR, exist_ok=True); os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement (AVEC STEMMING) ---
print("\n[8/9] Définition de la fonction preprocess_text (avec Stemming)...")
import nltk
from nltk.corpus import stopwords
# --- Utilisation de PorterStemmer ---
from nltk.stem import PorterStemmer # Import du stemmer
from nltk.tokenize import word_tokenize
import string
stop_words_set_global = set(stopwords.words('english'))
# --- Création de l'objet Stemmer ---
stemmer_obj_global = PorterStemmer() # Création de l'objet
def preprocess_text(text):
    """Applique tokenisation, minuscules, suppression ponctuation/non-alpha, stop words ET STEMMING (Porter)."""
    if not isinstance(text, str): return ""
    try: tokens = word_tokenize(text.lower())
    except LookupError as e_tok: # Gestion erreur si ressource NLTK manque
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              resource_name = str(e_tok).split('Resource ')[1].split(' ')[0]; print(f"--- Tokenizer a besoin de '{resource_name}', tentative téléchargement ---")
              try: nltk.download(resource_name, quiet=True); print(f"--- Ressource '{resource_name}' téléchargée ---"); tokens = word_tokenize(text.lower())
              except Exception as e_dl_tok: print(f"--- Échec téléchargement '{resource_name}': {e_dl_tok} ---"); raise e_tok
         else: raise e_tok
    except Exception as e_tok_other: print(f"Erreur word_tokenize: {e_tok_other}"); raise e_tok_other
    # --- Application du Stemmer ---
    filtered_tokens = [stemmer_obj_global.stem(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie avec PorterStemmer.")
# Tester la nouvelle fonction
sample_text = "This is an example showing Information Retrieval with stemming and stop words removal."
stemmed_sample = preprocess_text(sample_text)
print(f"  Exemple Stemmed: {stemmed_sample}") # Attendu: 'exampl show inform retriev stem stop word remov' (stop words supprimés, ponctuation exclue)
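# Added sketch (illustrative): query-side and document-side stemming must map word
# variants to the same index term; this assumes the PorterStemmer created above.
for demo_word in ["retrieval", "retrieving", "retrieved"]:
    print(f"  Stem('{demo_word}') = {stemmer_obj_global.stem(demo_word)}") # all three should yield 'retriev'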

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
import re
import glob
def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {};
    try:
        with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
        for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
            topic_content = top_match.group(1)
            num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE); topic_id = num_match.group(1).strip() if num_match else None
            title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL); title = title_match.group(1).strip() if title_match else ""
            desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL); desc = desc_match.group(1).strip() if desc_match else ""
            if topic_id and title: topics[topic_id] = {'title': title, 'desc': desc}
    except Exception as e_topic: print(f"  ATTENTION: Erreur parsing {file_path}: {e_topic}")
    return topics
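# Added sketch (illustrative): a minimal TREC-style topic block, written to a temporary
# file, to sanity-check parse_topics. The tags below simply mirror what the regexes above expect.
import tempfile
_demo_topic = """<top>
<num> Number: 999
<title> sample information retrieval topic
<desc> Description:
A short description used only to sanity-check the parser.
</top>
"""
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as _tmp_topic_file:
    _tmp_topic_file.write(_demo_topic)
print(f"  parse_topics self-test: {parse_topics(_tmp_topic_file.name)}")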

if not os.path.exists(TOPICS_DIR): print(f"ATTENTION: Dossier topics '{TOPICS_DIR}' inexistant."); topic_files = []
else: topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
all_topics = {}
if not topic_files: print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    print(f"  Parsing fichiers topics: {topic_files}")
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés."); print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes (avec stemming)...")
    # Appliquer la NOUVELLE fonction preprocess_text (avec stemming)
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries: print(f"\nERREUR prétraitement requêtes: {e_preproc_queries}"); queries_short_preprocessed, queries_long_preprocessed = {}, {}

# --- Vérification Finale Java ---
print("\n--- Vérification Finale Version Java Active ---")
try: result = subprocess.run("java -version", shell=True, check=True, capture_output=True, text=True, timeout=10); print("STDERR:\n", result.stderr); print("\nConfirmation: Java 21 OK." if "21." in result.stderr else "\nATTENTION: Java 21 NON ACTIF ?!")
except Exception as e: print(f"\nERREUR vérification Java: {e}")
# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale Version Pyserini Installée ---")
try: result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30); print(result_pyserini.stdout)
except Exception as e: print(f"ERREUR vérification Pyserini: {e}")

print("\n--- Configuration Complète (avec Stemming) Terminée ---")
print("\nPause..."); time.sleep(2); print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.") # Devrait être ~275Mo

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.") # Devrait être ~1051

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise FileNotFoundError
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}") # Devrait être ~240k

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.") # Devrait être ~600Mo
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")
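# Added sanity check (sketch): peek at the first JSONL record to confirm the
# {"id": ..., "contents": ...} layout expected by Pyserini's JsonCollection.
if os.path.exists(JSONL_OUTPUT_PATH) and os.path.getsize(JSONL_OUTPUT_PATH) > 0:
    with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as f_peek:
        first_record = json.loads(f_peek.readline())
    print(f"  Sample record: id={first_record['id']}, contents[:80]={first_record['contents'][:80]!r}")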




# --- Nouvelle Cellule ---

# === Cellule 2: Indexation Baseline ===
# Crée l'index Lucene à partir de ap_docs.jsonl (sans prétraitement spécifique).

import os
import re # Nécessaire pour re.search sur la sortie d'indexation plus bas
import subprocess
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Début de l'indexation Baseline...")
print(f"Dossier source: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. Relancez l'étape d'extraction (Cellule 1).")

# Commande Pyserini
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Nombre de threads pour l'indexation
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options de stockage
]

print(f"Exécution: {' '.join(index_cmd_baseline)}")
try:
    # Exécuter la commande d'indexation
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:]) # Afficher la fin de stdout
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si 0 document a été indexé (signe de problème)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
         # Extraire le nombre de documents indexés si possible (peut varier selon la sortie de Pyserini)
         num_docs_indexed_str = "inconnu"
         match = re.search(r"Total (\d+) documents indexed", result.stdout)
         if match:
             num_docs_indexed_str = match.group(1)
         print(f"\nIndexation Baseline terminée. {num_docs_indexed_str} documents indexés dans {INDEX_DIR_BASELINE}")

except Exception as e:
    # Gérer les erreurs potentielles
    print(f"\nERREUR pendant l'indexation Baseline: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index créé
print(f"\nVérification taille index: {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'" # Commande pour taille dossier
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}") # Afficher la taille
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")
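# Added sanity check (sketch): read back basic statistics from the freshly built index.
# IndexReader.stats() should be available in recent Pyserini releases; wrapped in try/except
# in case the import path differs in your version.
try:
    from pyserini.index.lucene import IndexReader
    reader_baseline = IndexReader(INDEX_DIR_BASELINE)
    print(f"  Baseline index stats: {reader_baseline.stats()}")
except Exception as e_stats:
    print(f"  (IndexReader check skipped: {e_stats})")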


# --- Nouvelle Cellule ---

# === Cellule 3: Préparer les Données Prétraitées (avec Stemming) ===
# Lit ap_docs.jsonl, applique la fonction preprocess_text (avec stemming)
# et écrit le résultat dans ap_docs_preprocessed.jsonl.

import json
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins et la fonction sont définis
try:
    CORPUS_DIR
    JSONL_OUTPUT_PATH # Défini dans la Cellule 1 (extraction), utilisé comme entrée ici
    preprocess_text # Défini dans la config (version stemming)
except NameError:
    print("ERREUR: Variables/Fonction non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier de sortie

print(f"Préparation données prétraitées (avec stemming) depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# Vérifier fichier source
if not os.path.exists(JSONL_OUTPUT_PATH) or os.path.getsize(JSONL_OUTPUT_PATH) == 0:
     raise FileNotFoundError(f"Le fichier source {JSONL_OUTPUT_PATH} est manquant ou vide.")

doc_count_preproc = 0
error_count = 0
try:
    # Ouvrir les fichiers d'entrée et de sortie
    with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
         open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:
        # Boucler sur chaque ligne du fichier d'entrée
        for line in tqdm(infile, desc="Prétraitement (Stemming)"):
            try:
                data = json.loads(line)
                doc_id = data.get('id', None)
                original_contents = data.get('contents', '')
                if doc_id is None: error_count += 1; continue

                # Appliquer le prétraitement (avec stemming)
                preprocessed_contents = preprocess_text(original_contents)

                # Écrire la ligne traitée
                json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                outfile.write(json_line + '\n')
                doc_count_preproc += 1
            except json.JSONDecodeError: error_count += 1 # Compter les erreurs JSON
            except Exception as e_line: print(f"\nErreur ligne (id={data.get('id', 'inconnu')}): {e_line}"); error_count += 1

    # Afficher le résumé
    print(f"\nTerminé.")
    print(f"  {doc_count_preproc} documents prétraités (stemming) écrits dans {JSONL_PREPROC_PATH}")
    if error_count > 0: print(f"  {error_count} lignes ignorées.")

    # Vérifier la taille du fichier de sortie
    if os.path.exists(JSONL_PREPROC_PATH):
        output_size = os.path.getsize(JSONL_PREPROC_PATH)
        print(f"  Taille finale: {output_size} octets.") # Sera probablement plus petit que l'original
        if output_size == 0 and doc_count_preproc > 0: print("  ATTENTION: Taille nulle ?!")
    else: print(f"  ATTENTION: Fichier sortie {JSONL_PREPROC_PATH} non créé.")

except Exception as e_main:
    print(f"ERREUR générale préparation données prétraitées: {e_main}")
    traceback.print_exc()
    raise
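# Added sanity check (sketch): compare the first document before and after preprocessing
# to eyeball the effect of stop-word removal and Porter stemming.
if os.path.exists(JSONL_OUTPUT_PATH) and os.path.exists(JSONL_PREPROC_PATH):
    with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as f_raw, \
         open(JSONL_PREPROC_PATH, 'r', encoding='utf-8') as f_pre:
        raw_doc = json.loads(f_raw.readline())
        pre_doc = json.loads(f_pre.readline())
    print(f"  Original ({raw_doc['id']}): {raw_doc['contents'][:100]!r}")
    print(f"  Stemmed  ({pre_doc['id']}): {pre_doc['contents'][:100]!r}")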


# --- Nouvelle Cellule ---

# === Cellule 4: Indexation Prétraitée (Stemming) ===
# Crée l'index Lucene à partir de ap_docs_preprocessed.jsonl (stemmed).
# Utilise l'option --pretokenized.

import os
import subprocess
import traceback
import re # Importer re pour l'analyse de la sortie

# Vérifier que les chemins sont définis
try: CORPUS_DIR; INDEX_DIR_PREPROC
except NameError: print("ERREUR: Variables chemin non définies."); raise

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source

print(f"Début indexation Prétraitée (Stemming)...")
print(f"Source: {CORPUS_DIR}") # Pyserini prend le dossier
print(f"Fichier JSONL attendu: {JSONL_PREPROC_PATH}")
print(f"Index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe et n'est pas vide
if not os.path.exists(JSONL_PREPROC_PATH) or os.path.getsize(JSONL_PREPROC_PATH) == 0:
    raise FileNotFoundError(f"Fichier prétraité {JSONL_PREPROC_PATH} manquant/vide.")

# Commande Pyserini
index_cmd = [
    "python", "-m", "pyserini.index.lucene", "--collection", "JsonCollection",
    "--input", CORPUS_DIR, "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator", "--threads", "4",
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Option clé ici
]

print(f"Exécution: {' '.join(index_cmd)}")
try:
    # Exécuter la commande d'indexation
    result = subprocess.run(index_cmd, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:])
    print("Sortie STDERR:\n", result.stderr)
    # Essayer d'extraire le nombre de documents indexés
    num_docs_indexed_str = "inconnu"
    match = re.search(r"Total (\d+) documents indexed", result.stdout)
    if match:
        num_docs_indexed_str = match.group(1)

    if "Total 0 documents indexed" in result.stdout: print("\nATTENTION: 0 document indexé.")
    else: print(f"\nIndexation Prétraitée (Stemming) terminée. {num_docs_indexed_str} documents indexés dans {INDEX_DIR_PREPROC}")
except Exception as e:
    # Gérer les erreurs
    print(f"\nERREUR pendant l'indexation Prétraitée: {e}")
    if isinstance(e, subprocess.CalledProcessError): print("STDERR:", e.stderr)
    else: traceback.print_exc()
    raise e

# Vérifier la taille de l'index
print(f"\nVérification taille index: {INDEX_DIR_PREPROC}...")
if os.path.exists(INDEX_DIR_PREPROC):
    du_cmd = f"du -sh '{INDEX_DIR_PREPROC}'" # Commande pour taille dossier
    try: result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True); print(f"  Taille: {result_du.stdout.split()[0]}") # Afficher la taille
    except Exception as e_du: print(f"  Impossible vérifier taille: {e_du}")
else: print("  ATTENTION: Dossier index non créé.")
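# Added sanity check (sketch): look up a stemmed term directly in the new index.
# get_term_counts(term, analyzer=None) skips re-analysis, which is what a --pretokenized
# index needs; the call is wrapped in try/except in case the API differs in your Pyserini version.
try:
    from pyserini.index.lucene import IndexReader
    reader_stem = IndexReader(INDEX_DIR_PREPROC)
    df_retriev, cf_retriev = reader_stem.get_term_counts("retriev", analyzer=None)
    print(f"  'retriev' in stemmed index: df={df_retriev}, cf={cf_retriev}")
except Exception as e_term:
    print(f"  (Term lookup skipped: {e_term})")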




# --- Nouvelle Cellule ---

# === Cellule 5b: Exécuter les Recherches en Parallèle (BM25 & QLD - avec Stemming) ===
# Lance les 8 combinaisons de recherche en parallèle en utilisant multiprocessing.Pool.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm # Pour la barre de progression
import traceback
import os
from jnius import JavaException # Importer seulement JavaException
from multiprocessing import Pool, cpu_count # Importer pour la parallélisation
import math # Pour calculer la taille des chunks

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; CORPUS_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text; # Fonction avec stemming doit être définie
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed (Stemming) manquant: {INDEX_DIR_PREPROC}")
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

# --- Fonction Worker pour la parallélisation ---
def perform_search_single_query_parallel(args):
    """Fonction exécutée par chaque processus pour une seule requête."""
    query_id, query_text, index_path, model, k, run_tag_prefix, use_preprocessed_query, is_preproc_run = args
    # Note: run_tag_prefix est passé, le tag complet est construit ici

    # Construire le tag final à l'intérieur du worker
    run_tag_suffix = "_stem" if is_preproc_run else ""
    final_run_tag = f"{run_tag_prefix}_{model}{run_tag_suffix}" # Ex: baseline_short_bm25 ou preproc_long_qld_stem

    try:
        # Initialiser le searcher DANS le processus fils pour éviter problèmes de sérialisation/JVM
        searcher = LuceneSearcher(index_path)

        # Configurer le modèle de similarité
        if model == 'bm25':
            searcher.set_bm25(k1=0.9, b=0.4)
        elif model == 'qld':
            searcher.set_qld()
        else:
            searcher.set_bm25() # Défaut

        # Prétraiter la requête si nécessaire (uniquement pour les runs baseline ici)
        search_text = preprocess_text(query_text) if use_preprocessed_query else query_text

        # Gérer les requêtes vides après traitement
        if not search_text.strip():
            return [] # Retourner une liste vide si la requête est vide

        # Exécuter la recherche
        hits = searcher.search(search_text, k=k)

        # Formater les résultats pour cette requête
        query_results = []
        for i in range(len(hits)):
            rank = i + 1
            doc_id = hits[i].docid
            score = hits[i].score
            if doc_id is None: continue # Ignorer si docid est None
            # Format TREC: qid Q0 docid rank score run_tag
            query_results.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {final_run_tag}\n")

        # Libérer explicitement le searcher (peut aider à gérer les ressources Java)
        del searcher
        return query_results

    except Exception as e:
        # Afficher l'erreur mais ne pas faire planter tout le pool
        print(f"\nERREUR dans worker pour QID {query_id} ({final_run_tag}): {e}")
        # print(traceback.format_exc()) # Décommenter pour trace complète si besoin
        return [] # Retourne une liste vide en cas d'erreur


# --- Fonction Principale pour lancer une recherche parallèle ---
def run_search_parallel_qld(queries, index_path, model, k, run_tag_prefix_arg, use_preprocessed_query=False):
    """Exécute la recherche en parallèle pour un ensemble de requêtes."""
    # Note: Renommé run_tag_prefix en run_tag_prefix_arg pour éviter conflit avec variable externe si nécessaire
    start_time = time.time()
    # Déterminer si c'est un run prétraité pour le tag et nom de fichier
    is_preproc_run = (index_path == INDEX_DIR_PREPROC)
    model_suffix = "_stem" if is_preproc_run else ""
    base_filename = f"{run_tag_prefix_arg}_{model}{model_suffix}.txt"
    output_run_file = os.path.join(RUN_DIR, base_filename)
    # Le tag complet pour affichage et description tqdm
    full_run_tag_display = f"{run_tag_prefix_arg}_{model}{model_suffix}"

    print(f"\nDébut recherche PARALLÈLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{full_run_tag_display}', k={k}")
    print(f"  Fichier de sortie prévu: {output_run_file}")

    # Préparer les arguments pour chaque tâche
    tasks = []
    for query_id, query_text in queries.items():
        # Passer run_tag_prefix_arg au worker
        tasks.append((query_id, query_text, index_path, model, k, run_tag_prefix_arg, use_preprocessed_query, is_preproc_run))

    # Définir le nombre de workers
    num_workers = 4
    print(f"  Utilisation de {num_workers} processus parallèles...")

    all_results_list = []
    # Utiliser Pool pour la parallélisation
    try:
        # Calculer chunksize pour tqdm
        chunksize = math.ceil(len(tasks) / num_workers / 4)
        if chunksize == 0: chunksize = 1

        with Pool(num_workers) as pool:
           results_iterator = pool.imap_unordered(perform_search_single_query_parallel, tasks, chunksize=chunksize)
           # Envelopper avec tqdm
           for result in tqdm(results_iterator, total=len(tasks), desc=f"Recherche {full_run_tag_display}"):
               if result: all_results_list.extend(result)

        print(f"\n  {len(all_results_list)} lignes de résultats collectées.")

        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
            os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
            with open(output_run_file, 'w', encoding='utf-8') as f_out:
               f_out.writelines(all_results_list)
            print(f"  Résultats sauvegardés dans {output_run_file}")
        else:
            print("  Avertissement: Aucun résultat collecté pour ce run parallèle.")

    except Exception as e_pool:
        print(f"\nERREUR MAJEURE pendant l'exécution parallèle de {full_run_tag_display}: {e_pool}")
        print(traceback.format_exc())
    finally:
        pass # Pool est fermé par 'with'

    end_time = time.time()
    print(f"Recherche PARALLÈLE terminée pour {full_run_tag_display}.")
    print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")


# --- Exécution des 8 configurations en parallèle ---
# Note: Utiliser run_search_parallel_qld (nom corrigé de la fonction principale)

print("\n--- DÉBUT DES RECHERCHES PARALLÈLES BASELINE (BM25/QLD) ---")
# Les variables run_tag_X ne sont plus utilisées, le nom de fichier est construit dans la fonction
run_search_parallel_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, "baseline_short")
run_search_parallel_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, "baseline_short")
run_search_parallel_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, "baseline_long")
run_search_parallel_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PARALLÈLES PRÉTRAITÉES (STEMMING - BM25/QLD) ---")
# Les noms de fichiers incluront '_stem' automatiquement car INDEX_DIR_PREPROC est utilisé
run_search_parallel_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, "preproc_short", use_preprocessed_query=False)
run_search_parallel_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, "preproc_short", use_preprocessed_query=False)
run_search_parallel_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, "preproc_long", use_preprocessed_query=False)
run_search_parallel_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches parallèles (Baseline et Stemming) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches parallèles...")
# Utiliser '!' pour exécuter ls
!ls -l {RUN_DIR}
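# Alternative worth noting (sketch): recent Pyserini versions expose
# LuceneSearcher.batch_search(queries, qids, k, threads), which parallelizes inside a single
# JVM instead of spawning one searcher per Python worker. Rough usage under that assumption:
def run_search_batch(queries, index_path, model, k, run_tag, output_file, threads=4):
    searcher = LuceneSearcher(index_path)
    if model == 'bm25':
        searcher.set_bm25(k1=0.9, b=0.4)
    else:
        searcher.set_qld()
    qids = list(queries.keys())
    results = searcher.batch_search([queries[q] for q in qids], qids, k=k, threads=threads)
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for qid in qids:
            for rank, hit in enumerate(results.get(qid, []), start=1):
                f_out.write(f"{qid} Q0 {hit.docid} {rank} {hit.score:.6f} {run_tag}\n")

# Example call (uncomment to try):
# run_search_batch(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, 'baseline_short_bm25',
#                  os.path.join(RUN_DIR, 'baseline_short_bm25_batch.txt'))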



# --- Nouvelle Cellule ---

# === Cellule 5 (version séquentielle, alternative à la Cellule 5b): Recherches BM25 & QLD avec Stemming ===
# Lance les 8 combinaisons de recherche séquentiellement: plus lent en théorie, mais souvent plus stable (et parfois plus rapide) que le multiprocessing sur Colab.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import JavaException # Importer seulement JavaException

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; CORPUS_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text; # Fonction avec stemming doit être définie
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed (Stemming) manquant: {INDEX_DIR_PREPROC}")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs.jsonl")): raise FileNotFoundError("ap_docs.jsonl manquant.")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")): raise FileNotFoundError("ap_docs_preprocessed.jsonl (stemmed) manquant.")

except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential_qld(queries, index_path, model, k, output_run_file_base, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes (BM25 ou QLD)."""
    start_time = time.time()
    # Ajouter '_stem' au tag et au nom de fichier pour les runs prétraités
    is_preproc_run = (index_path == INDEX_DIR_PREPROC)
    run_tag_suffix = "_stem" if is_preproc_run else ""
    model_suffix = "_stem" if is_preproc_run else "" # Suffixe pour nom de fichier aussi
    run_tag = f"{run_tag_prefix}_{model}{run_tag_suffix}"
    # Construire le nom de fichier de sortie final
    # Note: output_run_file_base n'est plus utilisé, on utilise run_tag_prefix directement
    base_filename = f"{run_tag_prefix}_{model}{model_suffix}.txt"
    output_run_file = os.path.join(RUN_DIR, base_filename)

    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")
    print(f"  Fichier de sortie prévu: {output_run_file}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")

        # Configurer similarité
        if model == 'bm25':
            print("  Configuration BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'qld': # Utiliser Query Likelihood Dirichlet
            print("  Configuration QLD..."); searcher.set_qld(); print("  QLD configuré.")
        else:
            print(f"Modèle '{model}' non reconnu, utilise BM25 par défaut."); searcher.set_bm25()

        # Itérer sur les requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # Utiliser la fonction preprocess_text (avec stemming) si nécessaire
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             # S'assurer que le dossier RUN_DIR existe avant d'écrire
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations (BM25 et QLD, avec index stemmatisé pour preproc) ---
print("\n--- DÉBUT DES RECHERCHES BASELINE (BM25/QLD) ---")
# Passer le préfixe du tag directement à la fonction
perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, "baseline_short", "baseline_short")
perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, "baseline_short", "baseline_short")
perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, "baseline_long", "baseline_long")
perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, "baseline_long", "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES (STEMMING - BM25/QLD) ---")
# Les noms de fichiers incluront '_stem' automatiquement car INDEX_DIR_PREPROC est utilisé
perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, "preproc_short", "preproc_short", use_preprocessed_query=False)
perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, "preproc_short", "preproc_short", use_preprocessed_query=False)
perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, "preproc_long", "preproc_long", use_preprocessed_query=False)
perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, "preproc_long", "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches (Baseline et Stemming) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}
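# Added quick check (sketch): run one stemmed query against the preprocessed index and print
# the top hits, to confirm the run files above contain sensible rankings.
try:
    _demo_qid = next(iter(queries_short_preprocessed))
    _demo_searcher = LuceneSearcher(INDEX_DIR_PREPROC)
    _demo_searcher.set_bm25(k1=0.9, b=0.4)
    _demo_hits = _demo_searcher.search(queries_short_preprocessed[_demo_qid], k=5)
    print(f"Top 5 for QID {_demo_qid} ('{queries_short_preprocessed[_demo_qid]}'):")
    for _i, _hit in enumerate(_demo_hits, start=1):
        print(f"  {_i}. {_hit.docid} ({_hit.score:.4f})")
except Exception as e_demo:
    print(f"(Demo search skipped: {e_demo})")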



# --- Nouvelle Cellule ---

# === Cellule 6: Évaluation des Runs (Stemming vs Baseline) ===
# Lit les fichiers Qrels, lit les fichiers de résultats (.txt) du dossier RUN_DIR,
# calcule MAP et P@10, et affiche/sauvegarde les tableaux récapitulatifs.
# Devrait maintenant évaluer les runs baseline (BM25/QLD) et preproc_stem (BM25/QLD).

import pandas as pd
import glob
import pytrec_eval
import os
import traceback

# Vérifier que les chemins sont définis
try:
    QRELS_DIR
    RUN_DIR
    EVAL_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Préparation Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files: print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}."); qrels_dict = {}
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")
    all_qrels_data = []
    for qf in qrels_files:
        try:
            # Lire le fichier qrels en spécifiant les types pour éviter les erreurs
            qrels_df = pd.read_csv(qf, sep=r'\s+', names=['query_id', 'unused', 'doc_id', 'relevance'],
                                   dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
            all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']])
        except Exception as e: print(f"Erreur lecture Qrels {qf}: {e}")
    if not all_qrels_data: print("ERREUR: Impossible lire données Qrels."); qrels_dict = {}
    else:
        combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)
        qrels_dict = {}
        # Convertir le DataFrame en dictionnaire attendu par pytrec_eval
        for _, row in combined_qrels_df.iterrows():
            qid, did, rel = str(row['query_id']), str(row['doc_id']), int(row['relevance'])
            if rel < 0: continue # Ignorer jugements négatifs
            if qid not in qrels_dict: qrels_dict[qid] = {}
            qrels_dict[qid][did] = rel
        print(f"Total {len(qrels_dict)} requêtes avec jugements chargées.")
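# Added format reminder (sketch): each qrels line is expected to read
#   <query_id> <unused> <doc_id> <relevance>
# and qrels_dict ends up shaped like {'<qid>': {'<doc_id>': <relevance>, ...}, ...},
# which is exactly what pytrec_eval.RelevanceEvaluator consumes.
if qrels_dict:
    _sample_qid = next(iter(qrels_dict))
    _sample_pairs = list(qrels_dict[_sample_qid].items())[:3]
    print(f"Sample qrels_dict['{_sample_qid}'] (first 3 judgments): {_sample_pairs}")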

# --- Évaluation des Runs ---
if not qrels_dict: print("\nAucun jugement de pertinence chargé, impossible d'évaluer.")
else:
    measures = {'map', 'P_10'} # Métriques à calculer
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures) # Initialiser l'évaluateur
    # Trouver tous les fichiers .txt dans le dossier des runs
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt")))
    print(f"\n{len(run_files)} fichiers de run à évaluer trouvés dans {RUN_DIR}.")
    print(f"  Fichiers: {[os.path.basename(f) for f in run_files]}") # Afficher les noms

    results_summary = [] # Liste pour stocker les résultats agrégés
    if not run_files: print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}.")
    else:
        # Boucler sur chaque fichier de run trouvé
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            # Ignorer les anciens runs RM3 s'ils existent encore (basés sur lemmatisation)
            # Ajustement: Ignorer tout run RM3 qui ne se termine PAS par _rm3_stem.txt
            # Cela permet d'ignorer un éventuel preproc_long_bm25_rm3.txt de l'essai précédent.
            if 'rm3' in run_name and not run_name.endswith('_rm3_stem.txt'):
                 print(f"\n--- Ignorer ancien run RM3 (non-stem): {run_name} ---")
                 continue

            print(f"\n--- Évaluation: {run_name} ---")
            run_dict = {} # Dictionnaire pour stocker les résultats de ce run
            error_count = 0
            line_count = 0
            try:
                # Lire le fichier run ligne par ligne
                with open(run_file, 'r', encoding='utf-8') as f_run:
                    for line in f_run:
                        line_count += 1
                        parts = line.strip().split()
                        # Vérifier le format TREC (6 colonnes)
                        if len(parts) != 6: error_count += 1; continue
                        qid, _, did, _, score, _ = parts # Extraire les infos utiles
                        try: score = float(score) # Convertir le score en float
                        except ValueError: error_count += 1; continue
                        qid = str(qid) # Assurer que qid est une chaîne
                        # Stocker le score pour ce document et cette requête
                        if qid not in run_dict: run_dict[qid] = {}
                        run_dict[qid][did] = score
                if error_count > 0: print(f"  Avertissement: {error_count} lignes mal formatées ignorées sur {line_count} lignes.")

                # Filtrer le run pour ne garder que les requêtes présentes dans les Qrels
                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_q = len(run_dict) - len(filtered_run_dict)
                if ignored_q > 0: print(f"  Avertissement: {ignored_q} requêtes run ignorées (absentes Qrels).") # Normal, basé sur 51 qrels
                if not filtered_run_dict: print("  Erreur: Aucune requête ne correspond aux Qrels."); continue

                # Évaluer le run filtré avec pytrec_eval
                eval_results = evaluator.evaluate(filtered_run_dict)
                # Calculer les moyennes des métriques sur toutes les requêtes évaluées
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                # Afficher les résultats moyens pour ce run
                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")

                # Extraire les informations du nom de fichier pour le résumé
                run_name_parts = run_name.replace('.txt','')
                parts = run_name_parts.split('_')
                if len(parts) >= 3:
                    index_type = parts[0] # baseline ou preproc
                    query_type = parts[1] # short ou long
                    # Gérer les suffixes _stem et _rm3 (un nom peut finir par _rm3_stem)
                    model_parts = parts[2:]
                    model_suffix = ""
                    # Enlever d'abord 'stem' s'il est en dernière position
                    if model_parts and model_parts[-1] == 'stem':
                        # Le stemming est déjà signalé via index_type='preproc'
                        model_parts = model_parts[:-1]
                    # Enlever ensuite 'rm3' s'il reste en dernière position
                    if model_parts and model_parts[-1] == 'rm3':
                        model_suffix += "+RM3"
                        model_parts = model_parts[:-1]

                    model_type = "_".join(model_parts) # Devrait être bm25 ou qld

                    # Utiliser 'Preprocessed (Stem)' comme nom d'index pour plus de clarté
                    display_index_type = "Preprocessed (Stem)" if index_type == "preproc" else "Baseline"

                    results_summary.append({
                        "Run Name": run_name, "Index": display_index_type, # Utiliser le nom plus clair
                        "Query Type": query_type.capitalize(),
                        "Weighting Scheme": model_type.upper() + model_suffix, # Ex: BM25, QLD, BM25+RM3
                        "MAP": avg_map, "P@10": avg_p10
                    })
                else: print(f"  Avertissement: Impossible parser nom run '{run_name}'.")

            except FileNotFoundError: print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e: print(f"  Erreur évaluation {run_name}: {e}"); traceback.print_exc()

        # Afficher et sauvegarder le résumé final
        if results_summary:
            print("\n\n=== Tableau Récapitulatif (Stemming vs Baseline) ===")
            results_df = pd.DataFrame(results_summary)
            # Trier pour une meilleure lisibilité
            results_df = results_df.sort_values(by=["Index", "Query Type", "Weighting Scheme"])

            # Afficher le DataFrame complet
            print("\n--- Résultats Complets ---")
            print(results_df.to_markdown(index=False, floatfmt=".4f"))

            # Essayer d'afficher les tableaux pivots
            try:
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP (Tableau Pivot) ---")
                print(pivot_map.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot MAP: {e_pivot})")

            try:
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 (Tableau Pivot) ---")
                print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot P@10: {e_pivot})")

            # Sauvegarder le DataFrame complet final avec un nom spécifique pour le stemming
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_stemming_final.csv")
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif complet sauvegardé: {summary_file_path}")
            except Exception as e_save: print(f"\nErreur sauvegarde résumé: {e_save}")
        else: print("\nAucun résultat d'évaluation à afficher.")



# --- Nouvelle Cellule ---

# === Cellule de Vérification des Dossiers d'Index ===
# Utilise les commandes shell de Colab préfixées par '!'

import os # Importer os pour définir les variables si besoin

# Définir les chemins au cas où ils ne seraient pas dans l'environnement
# (Normalement définis par la cellule de configuration complète)
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")

print("--- Contenu Index Baseline ---")
# Utiliser '!' pour exécuter ls -lh sur le dossier baseline
# Mettre le chemin entre guillemets pour gérer les espaces potentiels
!ls -lh "{INDEX_DIR_BASELINE}"

print("\n--- Contenu Index Preprocessed (Stemming) ---")
# Utiliser '!' pour exécuter ls -lh sur le dossier preprocessed
!ls -lh "{INDEX_DIR_PREPROC}"

# Ajouter une vérification d'existence pour être plus clair
print("\n--- Vérification d'existence ---")
if os.path.exists(INDEX_DIR_BASELINE):
    print(f"Le dossier {INDEX_DIR_BASELINE} existe.")
else:
    print(f"ATTENTION: Le dossier {INDEX_DIR_BASELINE} N'EXISTE PAS.")

if os.path.exists(INDEX_DIR_PREPROC):
    print(f"Le dossier {INDEX_DIR_PREPROC} existe.")
else:
    print(f"ATTENTION: Le dossier {INDEX_DIR_PREPROC} N'EXISTE PAS.")



# --- Nouvelle Cellule ---

!ls -ld /content/ap_output


# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.") # Devrait être ~275Mo

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.") # Devrait être ~1051

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise FileNotFoundError
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}") # Devrait être ~240k

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.") # Devrait être ~600Mo
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")



# --- Nouvelle Cellule ---

# === Sauvegarde du dossier /content/ap_output vers Google Drive ===
import os
import subprocess
import time

# Chemin de base du projet sur Drive (vérifiez qu'il est correct)
# Devrait être défini par la cellule de configuration complète
try:
    DRIVE_PROJECT_PATH
except NameError:
    print("ERREUR: La variable DRIVE_PROJECT_PATH n'est pas définie. Exécutez d'abord la cellule de configuration complète.")
    # Optionnel: Redéfinir ici si nécessaire, mais il vaut mieux exécuter la cellule de setup
    # DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC"
    raise

# Dossier source dans Colab à sauvegarder
SOURCE_DIR_TO_SAVE = "/content/ap_output"

# Dossier cible sur Google Drive pour la sauvegarde
# (Crée un sous-dossier 'colab_output_backup' dans votre dossier projet TREC)
TARGET_BACKUP_DIR_ON_DRIVE = os.path.join(DRIVE_PROJECT_PATH, "colab_output_backup")

print(f"Source à sauvegarder : {SOURCE_DIR_TO_SAVE}")
print(f"Cible sur Drive : {TARGET_BACKUP_DIR_ON_DRIVE}")

# Vérifier si le dossier source existe
if os.path.exists(SOURCE_DIR_TO_SAVE):
    # Créer le dossier cible sur Drive s'il n'existe pas
    print(f"Création (si nécessaire) du dossier cible: {TARGET_BACKUP_DIR_ON_DRIVE}")
    os.makedirs(TARGET_BACKUP_DIR_ON_DRIVE, exist_ok=True)

    print("\nCopie des fichiers vers Google Drive en cours... (Cela peut prendre plusieurs minutes)")
    # Utiliser cp -r (récursif), -u (update: copie seulement si plus récent ou manquant), -v (verbeux)
    # Copie le contenu de SOURCE_DIR_TO_SAVE dans TARGET_BACKUP_DIR_ON_DRIVE
    # L'option -u évite de recopier inutilement les gros index s'ils n'ont pas changé,
    # mais écrase les fichiers runs/ et eval/ s'ils existaient déjà avec le même nom.
    copy_cmd = f"cp -r -u -v '{SOURCE_DIR_TO_SAVE}/.' '{TARGET_BACKUP_DIR_ON_DRIVE}/'"
    try:
        # Exécuter la commande de copie
        process = subprocess.run(copy_cmd, shell=True, check=True, capture_output=True, text=True, timeout=900) # Timeout 15 minutes
        # Afficher la fin de la sortie pour confirmation
        print("... (Sortie de la copie, peut être longue) ...")
        # cp -v écrit la liste des fichiers copiés sur stdout
        stdout_lines = process.stdout.splitlines()
        print("\nExtrait de la fin de la sortie (fichiers copiés):")
        for line in stdout_lines[-20:]: # Afficher les 20 dernières lignes
             print(line)

        print("\nSauvegarde terminée avec succès !")
        print(f"Le contenu de {SOURCE_DIR_TO_SAVE} a été copié/mis à jour dans {TARGET_BACKUP_DIR_ON_DRIVE}")
        # Vérifier rapidement si les dossiers runs et eval existent dans la sauvegarde
        print("\nVérification de la sauvegarde sur Drive (partiel):")
        print(f"Contenu de {TARGET_BACKUP_DIR_ON_DRIVE}/runs :")
        !ls -l "{TARGET_BACKUP_DIR_ON_DRIVE}/runs" | head -n 10
        print(f"\nContenu de {TARGET_BACKUP_DIR_ON_DRIVE}/eval :")
        !ls -l "{TARGET_BACKUP_DIR_ON_DRIVE}/eval"
    except subprocess.CalledProcessError as e:
         print(f"\nERREUR lors de la sauvegarde (code {e.returncode}).")
         print("STDOUT:", e.stdout)
         print("STDERR:", e.stderr)
         print("\nVérifiez que vous avez les permissions d'écriture sur Google Drive.")
    except subprocess.TimeoutExpired as e:
        print(f"\nERREUR: La sauvegarde a dépassé le délai d'attente.")
    except Exception as e:
        print(f"\nERREUR inattendue lors de la sauvegarde: {e}")
else:
    print(f"Le dossier source {SOURCE_DIR_TO_SAVE} n'existe pas, aucune sauvegarde effectuée. Avez-vous exécuté la configuration complète ?")




# --- Nouvelle Cellule ---

# === Cellule 7: Exécuter la Recherche Améliorée (RM3) ===
# Applique RM3 sur la meilleure configuration de base identifiée à l'étape 6 (Stemming vs Baseline).
# !! N'OUBLIEZ PAS DE CONFIGURER LES VARIABLES BEST_... CI-DESSOUS !!

from pyserini.search.lucene import LuceneSearcher
from jnius import autoclass, JavaException
from tqdm.notebook import tqdm
import time
import traceback
import os
import re # Importer re pour l'analyse de sortie éventuelle

# Vérifier variables nécessaires
try: INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; EVAL_DIR; queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed; preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante."); raise

# --- À CONFIGURER selon vos meilleurs résultats de l'Étape 6 (Stemming vs Baseline) ---
print("--- Configuration RM3 ---")
print("Veuillez éditer les variables BEST_... ci-dessous en fonction de vos meilleurs résultats MAP de l'étape précédente (évaluation avec stemming).")
# Exemple: si baseline_long_bm25 était le meilleur (MAP ~0.2205)
BEST_INDEX_PATH = INDEX_DIR_BASELINE           # Mettez INDEX_DIR_BASELINE ou INDEX_DIR_PREPROC
BEST_QUERIES = queries_long                  # Mettez queries_short, queries_long, queries_short_preprocessed, ou queries_long_preprocessed
BEST_MODEL_BASE = 'bm25'                      # Mettez 'bm25' ou 'qld'
BEST_RUN_TAG_PREFIX = "baseline_long"          # Mettez 'baseline_short', 'baseline_long', 'preproc_short', ou 'preproc_long'
USE_PREPROC_QUERY_FOR_RM3 = False             # Mettez False si index baseline ou si BEST_QUERIES est _preprocessed, True si index preproc ET BEST_QUERIES est brut (peu courant)
# ----------------------------------------------------------------
print(f"Configuration choisie pour RM3:")
print(f"  Index: {os.path.basename(BEST_INDEX_PATH)}")
print(f"  Modèle Base: {BEST_MODEL_BASE}")
print(f"  Préfixe Tag: {BEST_RUN_TAG_PREFIX}")

# Nom du fichier et tag pour le run RM3
# Ajouter '_stem' au tag et nom de fichier si basé sur l'index preproc (stemming)
run_tag_suffix_rm3 = "_stem" if (BEST_INDEX_PATH == INDEX_DIR_PREPROC) else ""
PRF_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3{run_tag_suffix_rm3}.txt")
RM3_RUN_TAG = f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3{run_tag_suffix_rm3}"
print(f"  Run Tag: {RM3_RUN_TAG}")
print(f"  Fichier de sortie prévu: {PRF_RUN_FILE}")


# Paramètres RM3 (standards)
rm3_config = {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5}
print(f"  Paramètres RM3: {rm3_config}")

# --- Fonction de recherche RM3 (séquentielle) ---
def perform_search_sequential_rm3(queries, index_path, model_base, k, output_run_file, run_tag, use_preprocessed_query=False, rm3_params=None):
    """Exécute la recherche RM3 séquentiellement."""
    start_time = time.time(); print(f"\nDébut recherche RM3: Modèle='{model_base}+RM3', Tag='{run_tag}', k={k}")
    all_results_list = []; searcher = None
    try:
        print(f"  Init LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  Init OK.")
        # Configurer similarité base
        if model_base == 'bm25': print("  Config BM25 (base)..."); searcher.set_bm25(k1=0.9, b=0.4)
        elif model_base == 'qld': print("  Config QLD (base)..."); searcher.set_qld()
        else: print(f"Modèle base '{model_base}' non reconnu, utilise BM25."); searcher.set_bm25()
        # Activer RM3
        print("  Activation RM3..."); searcher.set_rm3(**rm3_params); print("  RM3 activé.")
        # Itérer sur requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # Prétraitement de la requête seulement si nécessaire
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query: query_errors += 1; # Limiter affichage

        # --- CORRECTION SYNTAXE ICI ---
        # Écrire résultats (avec indentation correcte)
        if all_results_list:
            # S'assurer que le dossier RUN_DIR existe avant d'écrire
            os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
            # Ouvrir le fichier et écrire les lignes
            with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
            # Afficher confirmation
            print(f"\n  {len(all_results_list)} lignes résultats écrites: {os.path.basename(output_run_file)}.")
        else:
            print("\n  Avertissement: Aucun résultat RM3 généré.")
        # --- FIN CORRECTION ---

        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs requêtes.")
        end_time = time.time(); print(f"Recherche RM3 terminée {run_tag}. Temps: {end_time - start_time:.2f}s.")

    except Exception as e_main: print(f"\nERREUR MAJEURE run RM3 {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite {run_tag}.")

# Lancer la recherche RM3
print("\nLancement de la recherche RM3...")
perform_search_sequential_rm3( BEST_QUERIES, BEST_INDEX_PATH, BEST_MODEL_BASE, K_RESULTS, PRF_RUN_FILE, RM3_RUN_TAG, use_preprocessed_query=USE_PREPROC_QUERY_FOR_RM3, rm3_params=rm3_config)

print("\n--- Exécution recherche RM3 terminée. ---")
# Vérifier création fichier
print(f"\nVérification création fichier {PRF_RUN_FILE}...")
# Utiliser !ls avec des guillemets pour gérer les chemins
!ls -l "{PRF_RUN_FILE}"


# --- Nouvelle Cellule ---

# === Cellule 8: Évaluation Finale (Tous les Runs) ===
# Lit les fichiers Qrels, lit TOUS les fichiers de résultats (.txt) du dossier RUN_DIR,
# calcule MAP et P@10, et affiche/sauvegarde les tableaux récapitulatifs finaux.

import pandas as pd
import glob
import pytrec_eval
import os
import traceback
import re # Importer re pour parser les noms de run

# Vérifier que les chemins sont définis
try:
    QRELS_DIR
    RUN_DIR
    EVAL_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Préparation Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files: print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}."); qrels_dict = {}
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")
    all_qrels_data = []
    for qf in qrels_files:
        try:
            # Lire le fichier qrels en spécifiant les types pour éviter les erreurs
            qrels_df = pd.read_csv(qf, sep=r'\s+', names=['query_id', 'unused', 'doc_id', 'relevance'],
                                   dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
            all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']])
        except Exception as e: print(f"Erreur lecture Qrels {qf}: {e}")
    if not all_qrels_data: print("ERREUR: Impossible lire données Qrels."); qrels_dict = {}
    else:
        combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)
        qrels_dict = {}
        # Convertir le DataFrame en dictionnaire attendu par pytrec_eval
        for _, row in combined_qrels_df.iterrows():
            qid, did, rel = str(row['query_id']), str(row['doc_id']), int(row['relevance'])
            if rel < 0: continue # Ignorer jugements négatifs
            if qid not in qrels_dict: qrels_dict[qid] = {}
            qrels_dict[qid][did] = rel
        print(f"Total {len(qrels_dict)} requêtes avec jugements chargées.")

# --- Évaluation des Runs ---
if not qrels_dict: print("\nAucun jugement de pertinence chargé, impossible d'évaluer.")
else:
    measures = {'map', 'P_10'} # Métriques à calculer
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures) # Initialiser l'évaluateur
    # Trouver tous les fichiers .txt dans le dossier des runs
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt")))
    print(f"\n{len(run_files)} fichiers de run à évaluer trouvés dans {RUN_DIR}.") # Devrait être 9 maintenant
    print(f"  Fichiers: {[os.path.basename(f) for f in run_files]}") # Afficher les noms

    results_summary = [] # Liste pour stocker les résultats agrégés
    if not run_files: print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}.")
    else:
        # Boucler sur chaque fichier de run trouvé
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            print(f"\n--- Évaluation: {run_name} ---")
            run_dict = {} # Dictionnaire pour stocker les résultats de ce run
            error_count = 0
            line_count = 0
            try:
                # Lire le fichier run ligne par ligne
                with open(run_file, 'r', encoding='utf-8') as f_run:
                    for line in f_run:
                        line_count += 1
                        parts = line.strip().split()
                        # Vérifier le format TREC (6 colonnes)
                        if len(parts) != 6: error_count += 1; continue
                        qid, _, did, _, score, _ = parts # Extraire les infos utiles
                        try: score = float(score) # Convertir le score en float
                        except ValueError: error_count += 1; continue
                        qid = str(qid) # Assurer que qid est une chaîne
                        # Stocker le score pour ce document et cette requête
                        if qid not in run_dict: run_dict[qid] = {}
                        run_dict[qid][did] = score
                if error_count > 0: print(f"  Avertissement: {error_count} lignes mal formatées ignorées sur {line_count} lignes.")

                # Filtrer le run pour ne garder que les requêtes présentes dans les Qrels
                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_q = len(run_dict) - len(filtered_run_dict)
                if ignored_q > 0: print(f"  Avertissement: {ignored_q} requêtes run ignorées (absentes Qrels).") # Normal
                if not filtered_run_dict: print("  Erreur: Aucune requête ne correspond aux Qrels."); continue

                # Évaluer le run filtré avec pytrec_eval
                eval_results = evaluator.evaluate(filtered_run_dict)
                # Calculer les moyennes des métriques sur toutes les requêtes évaluées
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                # Afficher les résultats moyens pour ce run
                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")

                # Extraire les informations du nom de fichier pour le résumé
                run_name_parts = run_name.replace('.txt','')
                parts = run_name_parts.split('_')
                if len(parts) >= 3:
                    index_type_raw = parts[0] # baseline ou preproc
                    query_type = parts[1] # short ou long
                    # Gérer les suffixes _stem et _rm3
                    model_parts = parts[2:]
                    model_suffix = ""
                    is_stem = False
                    # Vérifier si le dernier élément est 'rm3'
                    if model_parts and model_parts[-1] == 'rm3':
                        model_suffix += "+RM3"
                        model_parts = model_parts[:-1] # Enlever 'rm3'
                    # Vérifier si le dernier élément (restant) est 'stem'
                    if model_parts and model_parts[-1] == 'stem':
                        is_stem = True
                        model_parts = model_parts[:-1] # Enlever 'stem'

                    model_type = "_".join(model_parts) # Devrait être BM25 ou QLD

                    # Utiliser un nom d'index plus clair pour l'affichage
                    if index_type_raw == "preproc":
                         display_index_type = "Preprocessed (Stem)" if is_stem else "Preprocessed (??)" # Devrait être Stem ici
                    else:
                         display_index_type = "Baseline"

                    # Ajouter les résultats au résumé
                    results_summary.append({
                        "Run Name": run_name, "Index": display_index_type,
                        "Query Type": query_type.capitalize(),
                        "Weighting Scheme": model_type.upper() + model_suffix, # Ex: BM25, QLD, BM25+RM3
                        "MAP": avg_map, "P@10": avg_p10
                    })
                else: print(f"  Avertissement: Impossible parser nom run '{run_name}'.")

            except FileNotFoundError: print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e: print(f"  Erreur évaluation {run_name}: {e}"); traceback.print_exc()

        # Afficher et sauvegarder le résumé final
        if results_summary:
            print("\n\n=== Tableau Récapitulatif Final (Tous les Runs) ===")
            results_df = pd.DataFrame(results_summary)
            # Trier pour une meilleure lisibilité
            results_df = results_df.sort_values(by=["Index", "Query Type", "Weighting Scheme"])

            # Afficher le DataFrame complet
            print("\n--- Résultats Complets Finaux ---")
            print(results_df.to_markdown(index=False, floatfmt=".4f"))

            # Essayer d'afficher les tableaux pivots
            try:
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP Final (Tableau Pivot) ---")
                print(pivot_map.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot MAP: {e_pivot})")

            try:
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 Final (Tableau Pivot) ---")
                print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot P@10: {e_pivot})")

            # Sauvegarder le DataFrame complet final
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_final.csv") # Nom final
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif complet sauvegardé: {summary_file_path}")
            except Exception as e_save: print(f"\nErreur sauvegarde résumé: {e_save}")
        else: print("\nAucun résultat d'évaluation à afficher.")



# --- Nouvelle Cellule ---

# === Sauvegarde Finale (Stemming+RM3) vers Nouveau Dossier Drive ===
import os
import subprocess
import time
from datetime import datetime # Pour ajouter une date au nom du dossier

# Chemin de base du projet sur Drive (vérifiez qu'il est correct)
try: DRIVE_PROJECT_PATH
except NameError: print("ERREUR: DRIVE_PROJECT_PATH non défini."); raise

# Dossier source dans Colab à sauvegarder
SOURCE_DIR_TO_SAVE = "/content/ap_output"

# --- Création d'un NOUVEAU nom de dossier cible sur Drive ---
# Ajouter une date ou une description pour le distinguer
timestamp = datetime.now().strftime("%Y%m%d") # Format YYYYMMDD
TARGET_BACKUP_DIR_ON_DRIVE = os.path.join(DRIVE_PROJECT_PATH, f"colab_output_backup_stem_rm3_{timestamp}")
# Ou un nom plus simple si vous préférez:
# TARGET_BACKUP_DIR_ON_DRIVE = os.path.join(DRIVE_PROJECT_PATH, "colab_output_backup_stemming_final")

print(f"Source à sauvegarder : {SOURCE_DIR_TO_SAVE}")
print(f"Cible sur Drive (Nouveau Dossier) : {TARGET_BACKUP_DIR_ON_DRIVE}")

# Vérifier si le dossier source existe
if os.path.exists(SOURCE_DIR_TO_SAVE):
    # Créer le dossier cible sur Drive (ne devrait pas exister)
    print(f"Création du dossier cible: {TARGET_BACKUP_DIR_ON_DRIVE}")
    # exist_ok=True : ne pas échouer s'il existe déjà et créer les dossiers parents si besoin
    os.makedirs(TARGET_BACKUP_DIR_ON_DRIVE, exist_ok=True)

    print("\nCopie des fichiers vers Google Drive en cours... (Cela peut prendre plusieurs minutes)")
    # Utiliser cp -r (récursif) et -v (verbeux). Pas besoin de -u car c'est un nouveau dossier.
    copy_cmd = f"cp -r -v '{SOURCE_DIR_TO_SAVE}/.' '{TARGET_BACKUP_DIR_ON_DRIVE}/'"
    try:
        process = subprocess.run(copy_cmd, shell=True, check=True, capture_output=True, text=True, timeout=900) # Timeout 15 minutes
        print("... (Sortie de la copie) ...")
        # cp -v écrit la liste des fichiers copiés sur stdout
        stdout_lines = process.stdout.splitlines()
        print("\nExtrait de la fin de la sortie (fichiers copiés):")
        for line in stdout_lines[-20:]: print(line) # Afficher les 20 dernières lignes copiées

        print("\nSauvegarde terminée avec succès !")
        print(f"Le contenu de {SOURCE_DIR_TO_SAVE} a été copié dans le NOUVEAU dossier {TARGET_BACKUP_DIR_ON_DRIVE}")
        # Vérifier rapidement si les dossiers runs et eval existent dans la sauvegarde
        print("\nVérification de la sauvegarde sur Drive (partiel):")
        print(f"Contenu de {TARGET_BACKUP_DIR_ON_DRIVE}/runs :")
        !ls -l "{TARGET_BACKUP_DIR_ON_DRIVE}/runs" | head -n 10 # Devrait montrer 9 fichiers .txt
        print(f"\nContenu de {TARGET_BACKUP_DIR_ON_DRIVE}/eval :")
        !ls -l "{TARGET_BACKUP_DIR_ON_DRIVE}/eval" # Devrait montrer evaluation_summary_final.csv
    except subprocess.CalledProcessError as e:
         print(f"\nERREUR lors de la sauvegarde (code {e.returncode}).")
         print("STDERR:", e.stderr)
    except Exception as e:
        print(f"\nERREUR inattendue lors de la sauvegarde: {e}")
else:
    print(f"Le dossier source {SOURCE_DIR_TO_SAVE} n'existe pas, aucune sauvegarde effectuée.")

To cite this code:

Loyer, Dominique. (2024). parallèlisation_projet1_taln_7_8avril2025.ipynb [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

projetFinal_sysCred_Onto_28avril 1-at-2025-07-0.ttl

Turtle file declaring an OWL ontology header that imports the shared credibility-verification ontology (http://www.dic9335.uqam.ca/ontologies/credibility-verification); apart from the prefix declarations, the file contains no axioms of its own.

Keywords: OWL, Turtle, ontology, owl:imports

@prefix : <http://www.semanticweb.org/valtchev/ontologies/2018/11/untitled-ontology-37/2025/3/26/untitled-ontology-33/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@base <http://www.semanticweb.org/valtchev/ontologies/2018/11/untitled-ontology-37/2025/3/26/untitled-ontology-33/> .

<http://www.semanticweb.org/valtchev/ontologies/2018/11/untitled-ontology-37/2025/3/26/untitled-ontology-33> rdf:type owl:Ontology ;
                                                                                                              owl:imports <http://www.dic9335.uqam.ca/ontologies/credibility-verification> .

###  Generated by the OWL API (version 4.5.29.2024-05-13T12:11:03Z) https://github.com/owlcs/owlapi
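
This file mainly serves to pull in the shared credibility-verification ontology through owl:imports. A minimal rdflib sketch for loading it and listing those imports (the local file name used below is hypothetical):

from rdflib import Graph
from rdflib.namespace import OWL

g = Graph()
g.parse("projetFinal_sysCred_Onto.ttl", format="turtle")  # hypothetical local copy of the file above

# List every ontology pulled in via owl:imports
for ontology, imported in g.subject_objects(OWL.imports):
    print(f"{ontology} imports {imported}")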

To cite this code:

Loyer, Dominique. (2024). projetFinal_sysCred_Onto_28avril 1-at-2025-07-0.ttl [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

pwc.R

R console transcript exploring a census dataset (Census.xlsx): loading it with readxl, subsetting by Education and Income Group, tallying missing values, and estimating the share of Masters/Doctorate holders earning more than 50K.

Keywords: R, readxl, subset, exploratory data analysis

# Notes:
# - 2044 Masters/Doctorate records; of these, 1181 earn more than 50K
#   (1181 / 30511 ≈ 3.87% of all records).
# - 23265 records earn 50K or less.
#
# Missing data:
#   workclass: 1769
#   occupation status: 1774
#   native country: 531

library(readxl)  # read_excel() comes from the readxl package
Census <- read_excel("~/Desktop/PwC/Census.xlsx")
View(Census)
summary(Census)
attach(Census)
str(Census)
head(Census)
tail(Census)
getwd()
pwc <- read_excel("Census.xlsx")
View(pwc)  # R is case-sensitive: View(), not view()
pwc
View(pwc)
str(pwc)
head(pwc)
subset (pwc, Education %in% "Masters")
subset (pwc, Education %in% "Masters" AND `Income Group` %in% "50k" OR `Income Group` %in% "50k.")
subset (pwc, Education %in% "Masters" AND Income Group %in% "50k" OR Income Group %in% "50k.")
subset (pwc, Education %in% "Masters" OR "Doctorates")
subset (pwc, Education %in% "Masters" | "Doctorates")
subset (pwc, Education %in% "Masters"|"Doctorates")
subset (pwc, Education %in% "Masters")
master <- subset (pwc, Education %in% "Masters")
master  # dplyr::filter() would need library(dplyr); printing the subset instead
master50kplus <- subset (pwc, `Income Group` %in% "50k")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% "50k.")
master50kplusDot
master
master50kplus <- subset (master, `Income Group` %in% "50k")
master50kplus
master50kplus <- subset (master, `Income Group` %in% ">50k")
master50kplus
# The income level must be quoted (e.g. ">50K"); an unquoted >50k is a syntax error
master
master$`Income Group`
master50kplus <- subset (master, `Income Group` %in% ">50K")
master50kplus
master50kplusDot <- subset (master50kplus, `Income Group` %in% ">50K.")
master50kplusDot
master50kplus
master50kplusDot <- subset (master, `Income Group` %in% ">50K.")
master50kplus+master50kplusDot
master50kplusDot
dim(master50kplus)
dim(master50kplusDot)
426+500
totalMaster50kplus <- 500+426
totalMaster50kplus
doctorate <- subset (pwc, Education %in% "Doctorate")
doctorate
doctorate50kplus <- subset (doctorate, `Income Group` %in% ">50K")
doctorate50kplus
doctorate50kplusDot <- subset (doctorate, `Income Group` %in% ">50K.")
doctorate50kplusDot
dim(doctorate50kplus)
dim(doctorate50kplusDot)
totalDoctorate50plus <- 130+125
totalDoctorate50plus
totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus <- totalDoctorate50plus+totalMaster50kplus
TotalDocPlusMasterEarn50kplus / 30511
people50less <- subset (pwc, `Income Group` %in% "<=50K")
people50less
people50lessDot <- subset (pwc, `Income Group` %in% "<=50K.")
people50lessDot
dim(people50less)
dim(people50lessDot)
peopleLess50K <- 10835+12430
peopleLess50K
plot(peopleLess50K)
hist(peopleLess50K)
hist(pwc$Age)  # hist() needs a numeric vector, not a whole data frame
subset (pwc, Age <=18)
subset (pwc, Age <=15)
subset (pwc, Age <=16)
subset (pwc, Age <=17)
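
For comparison, the same subsetting and proportion can be sketched with pandas (an illustrative translation only, assuming Census.xlsx exposes the same Education and Income Group columns and the income labels seen above):

import pandas as pd

pwc = pd.read_excel("Census.xlsx")  # assumes the same file as in the R session

high_income = [">50K", ">50K."]  # the two label variants observed above
grad = pwc[pwc["Education"].isin(["Masters", "Doctorate"])]
grad_high = grad[grad["Income Group"].isin(high_income)]

share = len(grad_high) / len(pwc)
print(f"{len(grad_high)} Masters/Doctorate records above 50K ({share:.2%} of {len(pwc)})")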

To cite this code:

Loyer, Dominique. (2024). pwc.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

randomForestClassifier_Desjardins.ipynb

Notebook fragment that loads a semicolon-separated CSV with pandas, splits it into train and test sets with scikit-learn, and fits a classifier (the transcript fits a linear SVM even though RandomForestClassifier is imported).

Keywords: Python, pandas, scikit-learn, train/test split, classification

import csv
import dask.dataframe as dd
import pandas as pd
import numpy as np
import scipy as sc
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
# The original assigned to df1.data (scikit-learn Bunch style); with pandas the
# DataFrame is loaded directly. The file path was left blank in the original.
df1 = pd.read_csv("", sep=';', encoding="ISO-8859-1")
df1.shape


# --- Nouvelle Cellule ---

print (np.shape(df1))

# --- Nouvelle Cellule ---



# --- Nouvelle Cellule ---

df1.head()  # head is a method: call it to display the first rows



# --- Nouvelle Cellule ---

%pip install -U scikit-learn  # use the %pip magic inside a notebook cell

# --- Nouvelle Cellule ---

# The target column is not specified in the original; "target" below is a
# placeholder column name to adapt to the actual dataset
X = df1.drop(columns=["target"])
y = df1["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

X_train.shape, y_train.shape

X_test.shape, y_test.shape


clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)                           



# --- Nouvelle Cellule ---

X.shape, y.shape
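
RandomForestClassifier is imported above but never used in the transcript; a minimal sketch of fitting it on the same split (assuming X_train, X_test, y_train, y_test from the cell above and numeric features) could be:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
print("Random forest test accuracy:", rf.score(X_test, y_test))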

# --- Nouvelle Cellule ---

To cite this code:

Loyer, Dominique. (2024). randomForestClassifier_Desjardins.ipynb [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

sysCRED_onto26avrtil.ttl

OWL ontology in Turtle (exported by the OWL API) modelling a source-credibility verification system: users submit evaluation requests about pieces of information, rules and AI models evaluate verification criteria, and evaluation reports assign credibility levels backed by evidence drawn from sources.

Keywords: OWL, Turtle, ontology, credibility verification

@base <http://www.dic9335.uqam.ca/ontologies/credibility-verification#> .
@prefix : <http://www.dic9335.uqam.ca/ontologies/credibility-verification#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
# 
# 
# #################################################################
# #
# #    Annotation properties
# #
# #################################################################
# 
# 
# http://www.w3.org/2002/07/owl#maxCardinality
# 
# 
# 
# #################################################################
# #
# #    Object Properties
# #
# #################################################################
# 
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#analyzesSource
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#appliesRule
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#assignsCredibilityLevel
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#basedOnEvidence
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#concernsCriterion
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#concernsInformation
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#configuredByExpert
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#evaluatesCriterion
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#fetchesDataFrom
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#hasAuthor
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#hasCriterionResult
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#hasOriginalSource
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#includesNLPResult
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#includesRuleResult
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#includesSourceAnalysis
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#isReportOf
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#isSubjectOfRequest
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#obtainedVia
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#originatesFrom
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#producesReport
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#submitsRequest
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#submittedBy
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#usesModel
# 
# 
# 
# #################################################################
# #
# #    Data properties
# #
# #################################################################
# 
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#authorName
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#coherenceScore
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#completionTimestamp
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#credibilityLevelValue
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#credibilityScoreValue
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#criterionResultConfidence
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#criterionResultValue
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#detectedBiases
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#evidenceSnippet
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#evidenceURL
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#informationContent
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#informationURL
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#modelName
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#modelType
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#reportSummary
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#requestStatus
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ruleDescription
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ruleLogic
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ruleResultValid
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ruleWeight
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#sentimentScore
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#sourceAnalyzedReputation
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#sourceAnalyzedURL
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#sourceMentionsCount
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#sourceReputationScore
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#sourceURL
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#submissionTimestamp
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#userName
# 
# 
# 
# #################################################################
# #
# #    Classes
# #
# #################################################################
# 
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#AcademicJournal
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ApiLLM
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Author
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#BaseDeFaits
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#CredibilityLevel
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Evidence
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Expert
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#FactCheckingOrganization
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#InfoSourceAnalyse
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#InformationFaibleCredibilite
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#InformationHauteCredibilite
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#InformationMoyenneCredibilite
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#InformationSoumise
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#InformationVerifiee
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ModeleIA
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#MoteurRecherche
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#NewsWebsite
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_Bas
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_Haut
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_Moyen
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_NonVerifie
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#PersonalBlog
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#RapportEvaluation
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#RefutingEvidence
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#RegleVerification
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#RequeteEvaluation
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ResultatCritere
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ResultatNLP
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ResultatRegle
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#ResultatVerification
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#SocialMediaPlatform
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Source
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#SupportingEvidence
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#SystemeExterne
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#User
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#VerificationCriterion
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#VerificationMethod
# 
# 
# 
# #################################################################
# #
# #    Individuals
# #
# #################################################################
# 
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Criteria_AuthorExpertise
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Criteria_CoherenceAnalysis
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Criteria_CrossReferencing
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Criteria_FactCheckDB
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Criteria_SourceReputation
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Criteria_ToneAnalysis
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_Bas
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_Haut
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_Moyen
# 
# http://www.dic9335.uqam.ca/ontologies/credibility-verification#Niveau_NonVerifie
# 
# 
# 
# #################################################################
# #
# #    Annotations
# #
# #################################################################
# 
# 
# 
# 
# 
# 
# 
# 
# #################################################################
# #
# #    General axioms
# #
# #################################################################
# 
# 
# 
# 
# 
# 
# Generated by the OWL API (version 4.5.29.2024-05-13T12:11:03Z) https://github.com/owlcs/owlapi

<credibility-verification> a owl:Ontology;
  rdfs:comment "Ontologie enrichie et adaptée modélisant les concepts liés à la vérification de la crédibilité des sources d'information sur le Web, basée sur le rapport de modélisation UML et inspirée par l'ontologie de subvention recherche."@fr;
  rdfs:label "Ontologie Système de Vérification de Sources (Adaptée Rapport + Subvention)"@fr;
  owl:versionInfo "2.1" .

owl:maxCardinality a owl:AnnotationProperty .

:analyzesSource a owl:ObjectProperty;
  rdfs:domain :InfoSourceAnalyse;
  rdfs:range :Source;
  rdfs:label "analyse source"@fr .

:appliesRule a owl:ObjectProperty, owl:FunctionalProperty;
  rdfs:domain :ResultatRegle;
  rdfs:range :RegleVerification;
  rdfs:label "applique règle"@fr .

:assignsCredibilityLevel a owl:ObjectProperty, owl:FunctionalProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range :CredibilityLevel;
  rdfs:comment "Lie un rapport d'évaluation au niveau de crédibilité final attribué."@fr;
  rdfs:label "assigne niveau crédibilité"@fr .

:basedOnEvidence a owl:ObjectProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range :Evidence;
  rdfs:comment "Lie un rapport d'évaluation aux preuves collectées."@fr;
  rdfs:label "basé sur preuve"@fr .

:concernsCriterion a owl:ObjectProperty, owl:FunctionalProperty;
  rdfs:domain :ResultatCritere;
  rdfs:range :VerificationCriterion;
  rdfs:label "concerne critère"@fr .

:concernsInformation a owl:ObjectProperty, owl:FunctionalProperty;
  owl:inverseOf :isSubjectOfRequest;
  rdfs:domain :RequeteEvaluation;
  rdfs:range :InformationSoumise;
  rdfs:label "concerne information"@fr .

:configuredByExpert a owl:ObjectProperty;
  rdfs:domain _:genid1;
  rdfs:range :Expert;
  rdfs:label "configuré par expert"@fr .

_:genid1 a owl:Class;
  owl:unionOf _:genid4 .

_:genid4 a rdf:List;
  rdf:first :ModeleIA;
  rdf:rest _:genid3 .

_:genid3 a rdf:List;
  rdf:first :RegleVerification;
  rdf:rest _:genid2 .

_:genid2 a rdf:List;
  rdf:first :VerificationCriterion;
  rdf:rest rdf:nil .

:evaluatesCriterion a owl:ObjectProperty;
  rdfs:domain _:genid5;
  rdfs:range :VerificationCriterion;
  rdfs:comment "Lie une règle ou un modèle au critère de vérification qu'il est conçu pour évaluer."@fr;
  rdfs:label "évalue critère"@fr .

_:genid5 a owl:Class;
  owl:unionOf _:genid7 .

_:genid7 a rdf:List;
  rdf:first :ModeleIA;
  rdf:rest _:genid6 .

_:genid6 a rdf:List;
  rdf:first :RegleVerification;
  rdf:rest rdf:nil .

:fetchesDataFrom a owl:ObjectProperty;
  rdfs:domain :RequeteEvaluation;
  rdfs:range :SystemeExterne;
  rdfs:label "récupère données de"@fr .

:hasAuthor a owl:ObjectProperty;
  rdfs:domain :InformationSoumise;
  rdfs:range :Author;
  rdfs:comment "Lie une information soumise à son auteur présumé."@fr;
  rdfs:label "a pour auteur"@fr .

:hasCriterionResult a owl:ObjectProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range :ResultatCritere;
  rdfs:comment "Lie un rapport au résultat détaillé pour un critère d'évaluation spécifique."@fr;
  rdfs:label "a résultat pour critère"@fr .

:hasOriginalSource a owl:ObjectProperty;
  rdfs:domain :InformationSoumise;
  rdfs:range :Source;
  rdfs:comment "Lie une information soumise à sa source d'origine principale."@fr;
  rdfs:label "a pour source originale"@fr .

:includesNLPResult a owl:ObjectProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range :ResultatNLP;
  rdfs:label "inclut résultat NLP"@fr .

:includesRuleResult a owl:ObjectProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range :ResultatRegle;
  rdfs:label "inclut résultat règle"@fr .

:includesSourceAnalysis a owl:ObjectProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range :InfoSourceAnalyse;
  rdfs:label "inclut analyse source"@fr .

:isReportOf a owl:ObjectProperty, owl:InverseFunctionalProperty;
  owl:inverseOf :producesReport;
  rdfs:domain :RapportEvaluation;
  rdfs:range :RequeteEvaluation;
  rdfs:label "est rapport de"@fr .

:isSubjectOfRequest a owl:ObjectProperty;
  rdfs:domain :InformationSoumise;
  rdfs:range :RequeteEvaluation;
  rdfs:label "est sujet de requête"@fr .

:obtainedVia a owl:ObjectProperty;
  rdfs:domain :ResultatCritere;
  rdfs:range _:genid8;
  rdfs:label "obtenu via"@fr .

_:genid8 a owl:Class;
  owl:unionOf _:genid10 .

_:genid10 a rdf:List;
  rdf:first :ResultatNLP;
  rdf:rest _:genid9 .

_:genid9 a rdf:List;
  rdf:first :ResultatRegle;
  rdf:rest rdf:nil .

:originatesFrom a owl:ObjectProperty;
  rdfs:domain :Evidence;
  rdfs:range :Source;
  rdfs:comment "Lie une preuve à la source d'où elle a été extraite."@fr;
  rdfs:label "provient de"@fr .

:producesReport a owl:ObjectProperty, owl:FunctionalProperty;
  rdfs:domain :RequeteEvaluation;
  rdfs:range :RapportEvaluation;
  rdfs:label "produit rapport"@fr .

:submitsRequest a owl:ObjectProperty;
  owl:inverseOf :submittedBy;
  rdfs:domain :User;
  rdfs:range :RequeteEvaluation;
  rdfs:label "soumet requête"@fr .

:submittedBy a owl:ObjectProperty, owl:FunctionalProperty;
  rdfs:domain :RequeteEvaluation;
  rdfs:range :User;
  rdfs:comment "Lie une requête de vérification à l'utilisateur qui l'a soumise."@fr;
  rdfs:label "soumise par"@fr .

:usesModel a owl:ObjectProperty, owl:FunctionalProperty;
  rdfs:domain :ResultatNLP;
  rdfs:range :ModeleIA;
  rdfs:label "utilise modèle"@fr .

:authorName a owl:DatatypeProperty;
  rdfs:domain :Author;
  rdfs:range xsd:string;
  rdfs:label "nom de l'auteur"@fr .

:coherenceScore a owl:DatatypeProperty;
  rdfs:domain :ResultatNLP;
  rdfs:range xsd:float;
  rdfs:label "score cohérence"@fr .

:completionTimestamp a owl:DatatypeProperty, owl:FunctionalProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range xsd:dateTime;
  rdfs:label "horodatage de complétion"@fr .

:credibilityLevelValue a owl:DatatypeProperty, owl:FunctionalProperty;
  rdfs:domain :CredibilityLevel;
  rdfs:range xsd:float;
  rdfs:label "valeur numérique niveau"@fr .

:credibilityScoreValue a owl:DatatypeProperty, owl:FunctionalProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range xsd:float;
  rdfs:label "valeur score crédibilité"@fr .

:criterionResultConfidence a owl:DatatypeProperty;
  rdfs:domain :ResultatCritere;
  rdfs:range xsd:float;
  rdfs:label "confiance résultat critère"@fr .

:criterionResultValue a owl:DatatypeProperty;
  rdfs:domain :ResultatCritere;
  rdfs:range xsd:string;
  rdfs:label "valeur résultat critère"@fr .

:detectedBiases a owl:DatatypeProperty;
  rdfs:domain :ResultatNLP;
  rdfs:range xsd:string;
  rdfs:comment "";
  rdfs:label "biais détectés"@fr .

:evidenceSnippet a owl:DatatypeProperty;
  rdfs:domain :Evidence;
  rdfs:range xsd:string;
  rdfs:label "extrait de la preuve"@fr .

:evidenceURL a owl:DatatypeProperty;
  rdfs:domain :Evidence;
  rdfs:range xsd:anyURI;
  rdfs:label "URL de la preuve"@fr .

:informationContent a owl:DatatypeProperty;
  rdfs:domain :InformationSoumise;
  rdfs:range xsd:string;
  rdfs:label "contenu de l'information"@fr .

:informationURL a owl:DatatypeProperty;
  rdfs:domain :InformationSoumise;
  rdfs:range xsd:anyURI;
  rdfs:label "URL de l'information"@fr .

:modelName a owl:DatatypeProperty;
  rdfs:domain :ModeleIA;
  rdfs:range xsd:string;
  rdfs:label "nom modèle"@fr .

:modelType a owl:DatatypeProperty;
  rdfs:domain :ModeleIA;
  rdfs:range xsd:string;
  rdfs:label "type modèle"@fr .

:reportSummary a owl:DatatypeProperty;
  rdfs:domain :RapportEvaluation;
  rdfs:range xsd:string;
  rdfs:label "résumé du rapport"@fr .

:requestStatus a owl:DatatypeProperty, owl:FunctionalProperty;
  rdfs:domain :RequeteEvaluation;
  rdfs:range xsd:string;
  rdfs:label "statut requête"@fr .

:ruleDescription a owl:DatatypeProperty;
  rdfs:domain :RegleVerification;
  rdfs:range xsd:string;
  rdfs:label "description règle"@fr .

:ruleLogic a owl:DatatypeProperty;
  rdfs:domain :RegleVerification;
  rdfs:range xsd:string;
  rdfs:label "logique règle"@fr .

:ruleResultValid a owl:DatatypeProperty;
  rdfs:domain :ResultatRegle;
  rdfs:range xsd:boolean;
  rdfs:label "résultat règle valide"@fr .

:ruleWeight a owl:DatatypeProperty;
  rdfs:domain :RegleVerification;
  rdfs:range xsd:float;
  rdfs:label "poids règle"@fr .

:sentimentScore a owl:DatatypeProperty;
  rdfs:domain :ResultatNLP;
  rdfs:range xsd:float;
  rdfs:label "score sentiment"@fr .

:sourceAnalyzedReputation a owl:DatatypeProperty;
  rdfs:domain :InfoSourceAnalyse;
  rdfs:range xsd:string;
  rdfs:label "réputation source analysée"@fr .

:sourceAnalyzedURL a owl:DatatypeProperty;
  rdfs:domain :InfoSourceAnalyse;
  rdfs:range xsd:anyURI;
  rdfs:label "URL source analysée"@fr .

:sourceMentionsCount a owl:DatatypeProperty;
  rdfs:domain :InfoSourceAnalyse;
  rdfs:range xsd:integer;
  rdfs:label "mentions source analysée"@fr .

:sourceReputationScore a owl:DatatypeProperty;
  rdfs:domain :Source;
  rdfs:range xsd:float;
  rdfs:label "score de réputation de la source"@fr .

:sourceURL a owl:DatatypeProperty, owl:FunctionalProperty;
  rdfs:domain :Source;
  rdfs:range xsd:anyURI;
  rdfs:label "URL de la source"@fr .

:submissionTimestamp a owl:DatatypeProperty, owl:FunctionalProperty;
  rdfs:domain :RequeteEvaluation;
  rdfs:range xsd:dateTime;
  rdfs:label "horodatage de soumission"@fr .

:userName a owl:DatatypeProperty;
  rdfs:domain :User;
  rdfs:range xsd:string;
  rdfs:label "nom d'utilisateur"@fr .

:AcademicJournal a owl:Class;
  rdfs:subClassOf :Source;
  rdfs:label "Revue Académique"@fr .

:ApiLLM a owl:Class;
  rdfs:subClassOf :SystemeExterne;
  rdfs:label "API de LLM"@fr .

:Author a owl:Class;
  rdfs:comment "Représente la personne ou l'entité créditée pour la création de l'information soumise."@fr;
  rdfs:label "Auteur"@fr .

:BaseDeFaits a owl:Class;
  rdfs:subClassOf :SystemeExterne;
  rdfs:label "Base de Données de Faits Vérifiés"@fr .

:CredibilityLevel a owl:Class;
  rdfs:comment "Représente le niveau de crédibilité qualitatif ou quantitatif attribué dans le rapport."@fr;
  rdfs:label "Niveau de Crédibilité"@fr .

:Evidence a owl:Class;
  rdfs:comment "Représente un élément d'information externe utilisé pour étayer ou réfuter l'information vérifiée."@fr;
  rdfs:label "Preuve"@fr .

:Expert a owl:Class;
  rdfs:subClassOf :User;
  rdfs:comment "Utilisateur qualifié responsable de la configuration et de l'amélioration du système (règles, modèles)."@fr;
  rdfs:label "Expert"@fr .

:FactCheckingOrganization a owl:Class;
  rdfs:subClassOf :Source;
  rdfs:label "Organisation de Vérification des Faits"@fr .

:InfoSourceAnalyse a owl:Class;
  rdfs:subClassOf _:genid11;
  rdfs:comment "Détails sur une source spécifique telle qu'analysée et présentée dans le rapport."@fr;
  rdfs:label "Information Source Analysée"@fr .

_:genid11 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :analyzesSource .

:InformationFaibleCredibilite a owl:Class;
  owl:equivalentClass _:genid12;
  rdfs:subClassOf _:genid22;
  rdfs:label "Information Faiblement Crédible"@fr .

_:genid12 a owl:Class;
  owl:intersectionOf _:genid21 .

_:genid21 a rdf:List;
  rdf:first :InformationVerifiee;
  rdf:rest _:genid19 .

_:genid19 a rdf:List;
  rdf:first _:genid20;
  rdf:rest _:genid17 .

_:genid17 a rdf:List;
  rdf:first _:genid18;
  rdf:rest _:genid13 .

_:genid13 a rdf:List;
  rdf:first _:genid14;
  rdf:rest rdf:nil .

_:genid14 a owl:Restriction;
  owl:someValuesFrom _:genid15;
  owl:onProperty :isSubjectOfRequest .

_:genid15 a owl:Restriction;
  owl:someValuesFrom _:genid16;
  owl:onProperty :producesReport .

_:genid16 a owl:Restriction;
  owl:hasValue :Niveau_Bas;
  owl:onProperty :assignsCredibilityLevel .

_:genid18 a owl:Class;
  owl:complementOf :InformationMoyenneCredibilite .

_:genid20 a owl:Class;
  owl:complementOf :InformationHauteCredibilite .

_:genid22 a owl:Restriction;
  owl:allValuesFrom _:genid23;
  owl:onProperty :isSubjectOfRequest .

_:genid23 a owl:Restriction;
  owl:allValuesFrom _:genid24;
  owl:onProperty :producesReport .

_:genid24 a owl:Restriction;
  owl:hasValue :Niveau_Bas;
  owl:onProperty :assignsCredibilityLevel .

:InformationHauteCredibilite a owl:Class;
  owl:equivalentClass _:genid25;
  rdfs:subClassOf _:genid31;
  rdfs:label "Information Hautement Crédible"@fr .

_:genid25 a owl:Class;
  owl:intersectionOf _:genid30 .

_:genid30 a rdf:List;
  rdf:first :InformationVerifiee;
  rdf:rest _:genid26 .

_:genid26 a rdf:List;
  rdf:first _:genid27;
  rdf:rest rdf:nil .

_:genid27 a owl:Restriction;
  owl:someValuesFrom _:genid28;
  owl:onProperty :isSubjectOfRequest .

_:genid28 a owl:Restriction;
  owl:someValuesFrom _:genid29;
  owl:onProperty :producesReport .

_:genid29 a owl:Restriction;
  owl:hasValue :Niveau_Haut;
  owl:onProperty :assignsCredibilityLevel .

_:genid31 a owl:Restriction;
  owl:allValuesFrom _:genid32;
  owl:onProperty :isSubjectOfRequest .

_:genid32 a owl:Restriction;
  owl:allValuesFrom _:genid33;
  owl:onProperty :producesReport .

_:genid33 a owl:Restriction;
  owl:hasValue :Niveau_Haut;
  owl:onProperty :assignsCredibilityLevel .

:InformationMoyenneCredibilite a owl:Class;
  owl:equivalentClass _:genid34;
  rdfs:subClassOf _:genid42;
  rdfs:label "Information Moyennement Crédible"@fr .

_:genid34 a owl:Class;
  owl:intersectionOf _:genid41 .

_:genid41 a rdf:List;
  rdf:first :InformationVerifiee;
  rdf:rest _:genid39 .

_:genid39 a rdf:List;
  rdf:first _:genid40;
  rdf:rest _:genid35 .

_:genid35 a rdf:List;
  rdf:first _:genid36;
  rdf:rest rdf:nil .

_:genid36 a owl:Restriction;
  owl:someValuesFrom _:genid37;
  owl:onProperty :isSubjectOfRequest .

_:genid37 a owl:Restriction;
  owl:someValuesFrom _:genid38;
  owl:onProperty :producesReport .

_:genid38 a owl:Restriction;
  owl:hasValue :Niveau_Moyen;
  owl:onProperty :assignsCredibilityLevel .

_:genid40 a owl:Class;
  owl:complementOf :InformationHauteCredibilite .

_:genid42 a owl:Restriction;
  owl:allValuesFrom _:genid43;
  owl:onProperty :isSubjectOfRequest .

_:genid43 a owl:Restriction;
  owl:allValuesFrom _:genid44;
  owl:onProperty :producesReport .

_:genid44 a owl:Restriction;
  owl:hasValue :Niveau_Moyen;
  owl:onProperty :assignsCredibilityLevel .

:InformationSoumise a owl:Class;
  rdfs:comment "Représente l'unité d'information (texte, URL) telle que soumise pour vérification."@fr;
  rdfs:label "Information Soumise"@fr .

:InformationVerifiee a owl:Class;
  owl:equivalentClass _:genid45;
  rdfs:label "Information Vérifiée"@fr .

_:genid45 a owl:Class;
  owl:intersectionOf _:genid49 .

_:genid49 a rdf:List;
  rdf:first :InformationSoumise;
  rdf:rest _:genid46 .

_:genid46 a rdf:List;
  rdf:first _:genid47;
  rdf:rest rdf:nil .

_:genid47 a owl:Restriction;
  owl:someValuesFrom _:genid48;
  owl:onProperty :isSubjectOfRequest .

_:genid48 a owl:Restriction;
  owl:someValuesFrom :RapportEvaluation;
  owl:onProperty :producesReport .

:ModeleIA a owl:Class;
  rdfs:subClassOf :VerificationMethod, _:genid50;
  rdfs:comment "Représente un modèle d'apprentissage automatique utilisé pour l'analyse sémantique ou autre."@fr;
  rdfs:label "Modèle IA/NLP"@fr .

_:genid50 a owl:Restriction;
  owl:minCardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :evaluatesCriterion .

:MoteurRecherche a owl:Class;
  rdfs:subClassOf :SystemeExterne;
  rdfs:label "Moteur de Recherche"@fr .

:NewsWebsite a owl:Class;
  rdfs:subClassOf :Source;
  rdfs:label "Site d'actualités"@fr .

:Niveau_Bas a owl:Class, owl:NamedIndividual, :CredibilityLevel;
  :credibilityLevelValue "0.2"^^xsd:float;
  rdfs:label "Crédibilité Faible"@fr .

:Niveau_Haut a owl:Class, owl:NamedIndividual, :CredibilityLevel;
  :credibilityLevelValue "0.8"^^xsd:float;
  rdfs:label "Crédibilité Élevée"@fr .

:Niveau_Moyen a owl:Class, owl:NamedIndividual, :CredibilityLevel;
  :credibilityLevelValue "0.5"^^xsd:float;
  rdfs:label "Crédibilité Moyenne"@fr .

:Niveau_NonVerifie a owl:Class, owl:NamedIndividual, :CredibilityLevel;
  rdfs:label "Non Vérifié"@fr .

:PersonalBlog a owl:Class;
  rdfs:subClassOf :Source;
  rdfs:label "Blog Personnel"@fr .

:RapportEvaluation a owl:Class;
  rdfs:subClassOf _:genid51;
  rdfs:comment "Encapsule les résultats complets du processus de vérification pour une requête donnée."@fr;
  rdfs:label "Rapport d'Évaluation"@fr .

_:genid51 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :assignsCredibilityLevel .

:RefutingEvidence a owl:Class;
  rdfs:subClassOf :Evidence;
  owl:disjointWith :SupportingEvidence;
  rdfs:label "Preuve réfutante"@fr .

:RegleVerification a owl:Class;
  rdfs:subClassOf :VerificationMethod, _:genid52;
  rdfs:comment "Représente une règle logique prédéfinie utilisée pour évaluer un aspect de la crédibilité."@fr;
  rdfs:label "Règle de Vérification"@fr .

_:genid52 a owl:Restriction;
  owl:minCardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :evaluatesCriterion .

:RequeteEvaluation a owl:Class;
  rdfs:subClassOf _:genid53, _:genid54, _:genid55;
  rdfs:comment "Représente une demande spécifique de vérification de crédibilité soumise par un utilisateur."@fr;
  rdfs:label "Requête d'Évaluation"@fr .

_:genid53 a owl:Restriction;
  owl:minCardinality "0"^^xsd:nonNegativeInteger;
  owl:onProperty :producesReport .

_:genid54 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :concernsInformation .

_:genid55 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :submittedBy .

:ResultatCritere a owl:Class;
  rdfs:subClassOf _:genid56, _:genid57;
  rdfs:comment "Représente le résultat de l'évaluation d'un critère spécifique pour une requête, potentiellement basé sur un ou plusieurs résultats de règles/NLP."@fr;
  rdfs:label "Résultat Critère"@fr .

_:genid56 a owl:Restriction;
  owl:minCardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :obtainedVia .

_:genid57 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :concernsCriterion .

:ResultatNLP a owl:Class;
  rdfs:subClassOf :ResultatVerification, _:genid58;
  owl:disjointWith :ResultatRegle;
  rdfs:comment "Résultat de l'analyse effectuée par un modèle IA/NLP."@fr;
  rdfs:label "Résultat NLP"@fr .

_:genid58 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :usesModel .

:ResultatRegle a owl:Class;
  rdfs:subClassOf :ResultatVerification, _:genid59;
  rdfs:comment "Résultat de l'application d'une règle de vérification spécifique."@fr;
  rdfs:label "Résultat Règle"@fr .

_:genid59 a owl:Restriction;
  owl:cardinality "1"^^xsd:nonNegativeInteger;
  owl:onProperty :appliesRule .

:ResultatVerification a owl:Class;
  rdfs:comment "Classe parente pour les résultats issus des différentes méthodes de vérification."@fr;
  rdfs:label "Résultat de Vérification (Interne)"@fr .

:SocialMediaPlatform a owl:Class;
  rdfs:subClassOf :Source;
  rdfs:label "Plateforme de Média Social"@fr .

:Source a owl:Class;
  rdfs:comment "Représente une entité (site web, organisation, personne) d'où provient l'information originale ou la preuve."@fr;
  rdfs:label "Source"@fr .

:SupportingEvidence a owl:Class;
  rdfs:subClassOf :Evidence;
  rdfs:label "Preuve à l'appui"@fr .

:SystemeExterne a owl:Class;
  rdfs:comment "Représente une source de données ou un service externe utilisé pendant le processus de vérification (API, base de données)."@fr;
  rdfs:label "Système Externe"@fr .

:User a owl:Class;
  rdfs:comment "Représente une personne interagissant avec le système de vérification."@fr;
  rdfs:label "Utilisateur"@fr .

:VerificationCriterion a owl:Class;
  rdfs:comment "Aspect spécifique évalué lors de la vérification (ex: réputation de la source, cohérence)."@fr;
  rdfs:label "Critère de Vérification"@fr .

:VerificationMethod a owl:Class;
  rdfs:comment "Représente une approche (règle, modèle IA) utilisée pour évaluer la crédibilité."@fr;
  rdfs:label "Méthode de Vérification"@fr .

:Criteria_AuthorExpertise a owl:NamedIndividual, :VerificationCriterion;
  rdfs:label "Expertise de l'auteur"@fr .

:Criteria_CoherenceAnalysis a owl:NamedIndividual, :VerificationCriterion;
  rdfs:label "Analyse de la cohérence"@fr .

:Criteria_CrossReferencing a owl:NamedIndividual, :VerificationCriterion;
  rdfs:label "Références croisées"@fr .

:Criteria_FactCheckDB a owl:NamedIndividual, :VerificationCriterion;
  rdfs:label "Consultation base de données Fact-Check"@fr .

:Criteria_SourceReputation a owl:NamedIndividual, :VerificationCriterion;
  rdfs:label "Réputation de la source"@fr .

:Criteria_ToneAnalysis a owl:NamedIndividual, :VerificationCriterion;
  rdfs:label "Analyse du ton (ex: neutre, biaisé)"@fr .

_:genid60 owl:maxCardinality "1"^^xsd:nonNegativeInteger .

_:genid61 a owl:AllDisjointClasses;
  owl:members _:genid66 .

_:genid66 a rdf:List;
  rdf:first :AcademicJournal;
  rdf:rest _:genid65 .

_:genid65 a rdf:List;
  rdf:first :FactCheckingOrganization;
  rdf:rest _:genid64 .

_:genid64 a rdf:List;
  rdf:first :NewsWebsite;
  rdf:rest _:genid63 .

_:genid63 a rdf:List;
  rdf:first :PersonalBlog;
  rdf:rest _:genid62 .

_:genid62 a rdf:List;
  rdf:first :SocialMediaPlatform;
  rdf:rest rdf:nil .

_:genid67 a owl:AllDisjointClasses;
  owl:members _:genid70 .

_:genid70 a rdf:List;
  rdf:first :ApiLLM;
  rdf:rest _:genid69 .

_:genid69 a rdf:List;
  rdf:first :BaseDeFaits;
  rdf:rest _:genid68 .

_:genid68 a rdf:List;
  rdf:first :MoteurRecherche;
  rdf:rest rdf:nil .

_:genid71 a owl:AllDisjointClasses;
  owl:members _:genid74 .

_:genid74 a rdf:List;
  rdf:first :InformationFaibleCredibilite;
  rdf:rest _:genid73 .

_:genid73 a rdf:List;
  rdf:first :InformationHauteCredibilite;
  rdf:rest _:genid72 .

_:genid72 a rdf:List;
  rdf:first :InformationMoyenneCredibilite;
  rdf:rest rdf:nil .

_:genid75 a owl:AllDisjointClasses;
  owl:members _:genid79 .

_:genid79 a rdf:List;
  rdf:first :Niveau_Bas;
  rdf:rest _:genid78 .

_:genid78 a rdf:List;
  rdf:first :Niveau_Haut;
  rdf:rest _:genid77 .

_:genid77 a rdf:List;
  rdf:first :Niveau_Moyen;
  rdf:rest _:genid76 .

_:genid76 a rdf:List;
  rdf:first :Niveau_NonVerifie;
  rdf:rest rdf:nil .
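
As a quick sanity check on the ontology above, it can be loaded with rdflib and its class labels listed. This is a minimal added sketch, not part of the showcased file; it assumes rdflib is installed and that the Turtle source is saved locally as `sysCRED_onto26avrtil.ttl`.

```python
from rdflib import Graph, RDF, RDFS, OWL, URIRef

# Load the Turtle ontology (local path is an assumption; adjust to your copy).
g = Graph()
g.parse("sysCRED_onto26avrtil.ttl", format="turtle")

# Print every named OWL class together with its French label,
# skipping the anonymous restriction/list nodes (_:genid...).
for cls in g.subjects(RDF.type, OWL.Class):
    if not isinstance(cls, URIRef):
        continue
    for label in g.objects(cls, RDFS.label):
        print(f"{cls}  ->  {label}")
```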

To cite this code:

Loyer, Dominique. (2024). sysCRED_onto26avrtil.ttl [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

sysTradfr_en 2.py

Streamlit application that translates French documents into English using a locally fine-tuned Helsinki-NLP opus-mt-fr-en model; the instructions below explain how to set it up and launch it.

Keywords: translation, Streamlit, Transformers, PDF

### How to run your translator

This new script is a complete application. Here are the steps to launch it.

#### Prerequisite: your trained model

The application needs the output of your first script. After you ran your training script, a folder was created. Its name looks like `Helsinki-NLP-opus-mt-fr-en-finetuned-fr-to-en...`. **That folder is your model. You will need it.**

#### Step 1: Create a new virtual environment

It is best to create a fresh, clean environment for this application so that libraries do not get mixed together.

1.  Open your Terminal.
2.  Go to your Desktop (`cd Desktop`).
3.  Create a new environment:
    ```bash
    python3 -m venv traducteur_env
    ```
4.  Activate it:
    ```bash
    source traducteur_env/bin/activate
    ```
    You should see `(traducteur_env)` at the beginning of your command line.

#### Step 2: Install the new libraries

While the environment is active, install the required packages. **Careful, some of them are new!**

```bash
pip install streamlit
pip install torch
pip install transformers
pip install sentencepiece
pip install PyMuPDF
```
* `streamlit`: for the web interface.
* `torch`, `transformers`, `sentencepiece`: to run the translation model.
* `PyMuPDF`: to read PDF files.

#### Step 3: Save and configure the new script

1.  Copy the code above into a new file named, for example, `app_traduction.py`.
2.  **THE MOST IMPORTANT STEP:** In the script, find the line:
    ```python
    MODEL_PATH = "chemin/vers/votre/modele_fine_tune"
    ```
3.  Replace `"chemin/vers/votre/modele_fine_tune"` with the **exact name of the folder** created by your training script (see the example just below). Make sure that folder is in the same place as your `app_traduction.py` script, or give the full path.
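
For instance, assuming the training run produced a folder named `mon_modele_finetune` (a made-up name; use your actual folder) right next to `app_traduction.py`, the line would become:

```python
# Hypothetical example -- substitute the real folder name produced by your training script
MODEL_PATH = "./mon_modele_finetune"
```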

#### Step 4: Launch the application

1.  Make sure your `(traducteur_env)` environment is still active in the terminal.
2.  Launch the application with the `streamlit` command:
    ```bash
    streamlit run app_traduction.py
    ```

3.  Your web browser should open automatically on a new page. That is your application!

You can now upload a PDF, click the "Traduire" button, and watch the magic happen.

To cite this code:

Loyer, Dominique. (2024). sysTradfr_en 2.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

sysTradfr_en.py

Streamlit application that translates French documents into English with a locally fine-tuned Helsinki-NLP opus-mt-fr-en model; see the usage instructions below.

Keywords: translation, Streamlit, Transformers, PDF

### How to run your translator

This new script is a complete application. Here are the steps to launch it.

#### Prerequisite: your trained model

The application needs the output of your first script. After you ran your training script, a folder was created. Its name looks like `Helsinki-NLP-opus-mt-fr-en-finetuned-fr-to-en...`. **That folder is your model. You will need it.**

#### Step 1: Create a new virtual environment

It is best to create a fresh, clean environment for this application so that libraries do not get mixed together.

1.  Open your Terminal.
2.  Go to your Desktop (`cd Desktop`).
3.  Create a new environment:
    ```bash
    python3 -m venv traducteur_env
    ```
4.  Activate it:
    ```bash
    source traducteur_env/bin/activate
    ```
    You should see `(traducteur_env)` at the beginning of your command line.

#### Step 2: Install the new libraries

While the environment is active, install the required packages. **Careful, some of them are new!**

```bash
pip install streamlit
pip install torch
pip install transformers
pip install sentencepiece
pip install PyMuPDF
```
* `streamlit`: for the web interface.
* `torch`, `transformers`, `sentencepiece`: to run the translation model.
* `PyMuPDF`: to read PDF files.

#### Step 3: Save and configure the new script

1.  Copy the code above into a new file named, for example, `app_traduction.py`.
2.  **THE MOST IMPORTANT STEP:** In the script, find the line:
    ```python
    MODEL_PATH = "chemin/vers/votre/modele_fine_tune"
    ```
3.  Replace `"chemin/vers/votre/modele_fine_tune"` with the **exact name of the folder** created by your training script (see the example just below). Make sure that folder is in the same place as your `app_traduction.py` script, or give the full path.
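
For instance, assuming the training run produced a folder named `mon_modele_finetune` (a made-up name; use your actual folder) right next to `app_traduction.py`, the line would become:

```python
# Hypothetical example -- substitute the real folder name produced by your training script
MODEL_PATH = "./mon_modele_finetune"
```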

#### Step 4: Launch the application

1.  Make sure your `(traducteur_env)` environment is still active in the terminal.
2.  Launch the application with the `streamlit` command:
    ```bash
    streamlit run app_traduction.py
    ```

3.  Your web browser should open automatically on a new page. That is your application!

You can now upload a PDF, click the "Traduire" button, and watch the magic happen.

To cite this code:

Loyer, Dominique. (2024). sysTradfr_en.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

translator 2.py

Streamlit web application that loads a fine-tuned French-to-English model from the Hugging Face Hub and translates an uploaded TXT or PDF document paragraph by paragraph, displaying the original text and the translation side by side.

Keywords: translation, Streamlit, Transformers, Hugging Face Hub, PyMuPDF

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF
import time

# --- Configuration de la page Streamlit ---
st.set_page_config(
    page_title="Traducteur de Documents",
    page_icon="🤖",
    layout="wide"
)

st.title("Traducteur de Documents (Français vers Anglais)")
st.markdown("Utilisez le modèle que vous avez entraîné sur Kaggle pour traduire vos documents scientifiques.")

# --- Chargement du Modèle ---
# Le cache de Streamlit permet de ne charger le modèle qu'une seule fois.
@st.cache_resource
def load_model(model_id):
    """
    Charge le tokenizer et le modèle depuis le Hugging Face Hub.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        return tokenizer, model
    except Exception as e:
        st.error(f"Erreur lors du chargement du modèle : {e}")
        st.error("Vérifiez que l'ID du modèle est correct et que le modèle est bien public sur le Hub.")
        return None, None

# --- Fonctions de Traitement ---
def extract_text_from_pdf(file):
    """
    Extrait le texte d'un fichier PDF.
    """
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        st.error(f"Erreur lors de la lecture du fichier PDF : {e}")
        return None

def translate_text(text, tokenizer, model, progress_bar, status_text):
    """
    Traduit un long texte en le divisant en paragraphes.
    """
    # Diviser le texte en paragraphes pour éviter de surcharger le modèle.
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    translated_text = ""
    total_paragraphs = len(paragraphs)
    
    for i, paragraph in enumerate(paragraphs):
        if not paragraph:
            continue
        
        # Mettre à jour l'interface utilisateur
        progress = (i + 1) / total_paragraphs
        progress_bar.progress(progress)
        status_text.text(f"Traduction du paragraphe {i+1}/{total_paragraphs}...")

        # Traduction
        try:
            inputs = tokenizer(paragraph, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model.generate(**inputs)
            translated_paragraph = tokenizer.decode(outputs[0], skip_special_tokens=True)
            translated_text += translated_paragraph + "\n\n"
        except Exception as e:
            st.warning(f"Impossible de traduire un paragraphe : {e}")
            translated_text += f"[ERREUR DE TRADUCTION DU PARAGRAPHE]\n\n"
        
        # Petite pause pour que l'interface reste fluide
        time.sleep(0.1)

    status_text.text("Traduction terminée !")
    return translated_text

# --- Interface Utilisateur ---
st.header("Étape 1 : Spécifiez votre modèle")
hub_model_id = st.text_input(
    "Entrez l'ID de votre modèle sur le Hugging Face Hub",
    value="VotreNomDUtilisateur/Helsinki-NLP-opus-mt-fr-en-finetuned",
    help="Exemple : 'google-t5/t5-base' ou l'ID de votre propre modèle après l'entraînement."
)

st.header("Étape 2 : Uploadez votre document")
uploaded_file = st.file_uploader(
    "Choisissez un fichier (.txt ou .pdf)",
    type=["txt", "pdf"]
)

if uploaded_file is not None:
    st.success(f"Fichier '{uploaded_file.name}' uploadé avec succès.")
    
    if st.button("Traduire le document"):
        if not hub_model_id or "VotreNomDUtilisateur" in hub_model_id:
            st.warning("Veuillez entrer un ID de modèle valide avant de traduire.")
        else:
            with st.spinner("Chargement du modèle... (cela peut prendre un moment la première fois)"):
                tokenizer, model = load_model(hub_model_id)

            if tokenizer and model:
                st.success("Modèle chargé avec succès.")
                
                # Extraction du texte
                original_text = ""
                if uploaded_file.type == "text/plain":
                    original_text = uploaded_file.read().decode("utf-8")
                elif uploaded_file.type == "application/pdf":
                    original_text = extract_text_from_pdf(uploaded_file)
                
                if original_text:
                    # Lancement de la traduction avec barres de progression
                    st.header("Traduction en cours...")
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    translated_text = translate_text(original_text, tokenizer, model, progress_bar, status_text)

                    # Affichage des résultats
                    st.header("Résultats")
                    col1, col2 = st.columns(2)
                    
                    with col1:
                        st.subheader("Texte Original (Français)")
                        st.text_area("", original_text, height=400)
                    
                    with col2:
                        st.subheader("Traduction (Anglais)")
                        st.text_area("", translated_text, height=400)

To cite this code:

Loyer, Dominique. (2024). translator 2.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

translator.py

Streamlit web application that loads a fine-tuned French-to-English model from the Hugging Face Hub and translates an uploaded TXT or PDF document paragraph by paragraph, displaying the original text and the translation side by side.

Keywords: translation, Streamlit, Transformers, Hugging Face Hub, PyMuPDF

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF
import time

# --- Configuration de la page Streamlit ---
st.set_page_config(
    page_title="Traducteur de Documents",
    page_icon="🤖",
    layout="wide"
)

st.title("Traducteur de Documents (Français vers Anglais)")
st.markdown("Utilisez le modèle que vous avez entraîné sur Kaggle pour traduire vos documents scientifiques.")

# --- Chargement du Modèle ---
# Le cache de Streamlit permet de ne charger le modèle qu'une seule fois.
@st.cache_resource
def load_model(model_id):
    """
    Charge le tokenizer et le modèle depuis le Hugging Face Hub.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        return tokenizer, model
    except Exception as e:
        st.error(f"Erreur lors du chargement du modèle : {e}")
        st.error("Vérifiez que l'ID du modèle est correct et que le modèle est bien public sur le Hub.")
        return None, None

# --- Fonctions de Traitement ---
def extract_text_from_pdf(file):
    """
    Extrait le texte d'un fichier PDF.
    """
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        st.error(f"Erreur lors de la lecture du fichier PDF : {e}")
        return None

def translate_text(text, tokenizer, model, progress_bar, status_text):
    """
    Traduit un long texte en le divisant en paragraphes.
    """
    # Diviser le texte en paragraphes pour éviter de surcharger le modèle.
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    translated_text = ""
    total_paragraphs = len(paragraphs)
    
    for i, paragraph in enumerate(paragraphs):
        if not paragraph:
            continue
        
        # Mettre à jour l'interface utilisateur
        progress = (i + 1) / total_paragraphs
        progress_bar.progress(progress)
        status_text.text(f"Traduction du paragraphe {i+1}/{total_paragraphs}...")

        # Traduction
        try:
            inputs = tokenizer(paragraph, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model.generate(**inputs)
            translated_paragraph = tokenizer.decode(outputs[0], skip_special_tokens=True)
            translated_text += translated_paragraph + "\n\n"
        except Exception as e:
            st.warning(f"Impossible de traduire un paragraphe : {e}")
            translated_text += f"[ERREUR DE TRADUCTION DU PARAGRAPHE]\n\n"
        
        # Petite pause pour que l'interface reste fluide
        time.sleep(0.1)

    status_text.text("Traduction terminée !")
    return translated_text

# --- Interface Utilisateur ---
st.header("Étape 1 : Spécifiez votre modèle")
hub_model_id = st.text_input(
    "Entrez l'ID de votre modèle sur le Hugging Face Hub",
    value="VotreNomDUtilisateur/Helsinki-NLP-opus-mt-fr-en-finetuned",
    help="Exemple : 'google-t5/t5-base' ou l'ID de votre propre modèle après l'entraînement."
)

st.header("Étape 2 : Uploadez votre document")
uploaded_file = st.file_uploader(
    "Choisissez un fichier (.txt ou .pdf)",
    type=["txt", "pdf"]
)

if uploaded_file is not None:
    st.success(f"Fichier '{uploaded_file.name}' uploadé avec succès.")
    
    if st.button("Traduire le document"):
        if not hub_model_id or "VotreNomDUtilisateur" in hub_model_id:
            st.warning("Veuillez entrer un ID de modèle valide avant de traduire.")
        else:
            with st.spinner("Chargement du modèle... (cela peut prendre un moment la première fois)"):
                tokenizer, model = load_model(hub_model_id)

            if tokenizer and model:
                st.success("Modèle chargé avec succès.")
                
                # Extraction du texte
                original_text = ""
                if uploaded_file.type == "text/plain":
                    original_text = uploaded_file.read().decode("utf-8")
                elif uploaded_file.type == "application/pdf":
                    original_text = extract_text_from_pdf(uploaded_file)
                
                if original_text:
                    # Lancement de la traduction avec barres de progression
                    st.header("Traduction en cours...")
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    translated_text = translate_text(original_text, tokenizer, model, progress_bar, status_text)

                    # Affichage des résultats
                    st.header("Résultats")
                    col1, col2 = st.columns(2)
                    
                    with col1:
                        st.subheader("Texte Original (Français)")
                        st.text_area("", original_text, height=400)
                    
                    with col2:
                        st.subheader("Traduction (Anglais)")
                        st.text_area("", translated_text, height=400)

To cite this code:

Loyer, Dominique. (2024). translator.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

TREC_AP88-90_5juin25(Pyserini&Lucene).ipynb

Colab notebook for TREC AP 88-90 experiments: it mounts Google Drive, extracts and formats the AP collection from AP.tar, builds baseline and stemmed Pyserini (Lucene) indexes, runs BM25 and QLD retrieval over short and long topics, and checks the resulting run files.

Keywords: information retrieval, Pyserini, Lucene, BM25, QLD, TREC

# === Cellule 0.1: Monter Google Drive ===
from google.colab import drive
drive.mount('/content/drive')

# Vérifiez que le dossier du projet est accessible
# Adaptez le chemin si nécessaire en fonction de l'emplacement dans votre Drive
!ls "/content/drive/MyDrive/Projet_RI"

# --- Nouvelle Cellule ---

# === Cellule de Vérification du Contenu du Dossier Runs (Corrigé) ===
# Utilise les commandes shell de Colab préfixées par '!'

# Chemin exact où les résultats de recherche sont attendus
# (Défini dans la cellule de configuration complète)
RUN_DIR_PATH="/content/ap_output/runs/"

# Afficher le chemin qui va être inspecté
print(f"Vérification du contenu de : {RUN_DIR_PATH}")

# Utiliser '!' pour exécuter la commande shell 'ls -lh'
# Mettre le chemin entre guillemets pour gérer les espaces potentiels (même s'il n'y en a pas ici)
!ls -lh "{RUN_DIR_PATH}"


# --- Nouvelle Cellule ---

# === Cellule 4: Exécuter les Recherches (Séquentielles - BM25 & QLD) ===
# Lance les 8 combinaisons de recherche en utilisant BM25 et QLD.
# S'assure que l'environnement Java 21 est actif et que les index/variables sont définis/restaurés.

# Assurer que pyserini est installé avant l'import
# Vous devriez normalement exécuter la Cellule 0 "Configuration Complète" avant celle-ci.
# Cette ligne est ajoutée comme filet de sécurité si la Cellule 0 n'a pas été exécutée
# ou a échoué pour pyserini. Supprimez-la si vous exécutez toujours la Cellule 0.
!pip install pyserini --quiet

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import JavaException # Importer seulement JavaException, ClassicSimilarity n'est pas utilisé

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index restaurés
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; CORPUS_DIR; # Ajout CORPUS_DIR pour vérif jsonl
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline restauré manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed restauré manquant: {INDEX_DIR_PREPROC}")
    # Vérifier aussi que les fichiers de corpus sont là (restaurés ou recréés)
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs.jsonl")): raise FileNotFoundError("ap_docs.jsonl manquant.")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")): raise FileNotFoundError("ap_docs_preprocessed.jsonl manquant.")

except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential_qld(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes (BM25 ou QLD)."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")

        # Configurer similarité
        if model == 'bm25':
            print("  Configuration BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'qld': # Utiliser Query Likelihood Dirichlet
            print("  Configuration QLD..."); searcher.set_qld(); print("  QLD configuré.")
        else:
            print(f"Modèle '{model}' non reconnu, utilise BM25 par défaut."); searcher.set_bm25()

        # Itérer sur les requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             # S'assurer que le dossier RUN_DIR existe avant d'écrire
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations (BM25 et QLD) ---
print("\n--- DÉBUT DES RECHERCHES BASELINE (BM25/QLD) ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_qld.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_2, "baseline_short")
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_qld.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_4, "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES (BM25/QLD) ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_qld.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_qld.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches de base (BM25/QLD) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}
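
# --- Nouvelle Cellule ---

# === Illustrative cell (added): scoring one run file with pytrec_eval ===
# A minimal sketch, not part of the original notebook. It evaluates the
# baseline_short_bm25 run produced above against a TREC qrels file, using
# pytrec_eval's parse_qrel/parse_run helpers. QRELS_FILE is an assumption:
# point it at the actual qrels file inside QRELS_DIR (defined in the
# configuration cell) before running.
import os
import pytrec_eval

QRELS_FILE = os.path.join(QRELS_DIR, "qrels.ap8890.txt")  # hypothetical file name

with open(QRELS_FILE, 'r', encoding='utf-8') as f_qrels:
    qrels = pytrec_eval.parse_qrel(f_qrels)
with open(run_file_1, 'r', encoding='utf-8') as f_run:
    run = pytrec_eval.parse_run(f_run)

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
per_query = evaluator.evaluate(run)
n = max(len(per_query), 1)
print(f"Queries evaluated: {len(per_query)}")
print(f"Mean MAP : {sum(m['map'] for m in per_query.values()) / n:.4f}")
print(f"Mean nDCG: {sum(m['ndcg'] for m in per_query.values()) / n:.4f}")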

# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (avec Stemming) ===
# Installe Java 21, configure comme défaut, installe outils build,
# pybind11, dernière Pyserini, NLTK+ressources, définit chemins,
# FONCTION preprocess_text AVEC STEMMING, parse topics.

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète (avec Stemming) ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
# S'assurer que nltk est importé
import nltk
# Liste incluant la correction pour punkt_tab
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab']
for resource in nltk_resources:
    try:
        # Déterminer le chemin de recherche correct pour nltk.data.find
        if resource == 'punkt' or resource == 'punkt_tab': # punkt_tab est aussi dans tokenizers
            resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4':
             resource_path = f'corpora/{resource}.zip' # Open Multilingual Wordnet
        elif resource == 'wordnet':
             resource_path = f'corpora/{resource}.zip'
        else: # stopwords, etc.
            resource_path = f'corpora/{resource}.zip'

        # Essayer de trouver la ressource
        nltk.data.find(resource_path)
        # print(f"  Ressource NLTK '{resource}' déjà présente.")

    # Utiliser except LookupError (correction appliquée)
    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            # Capturer les erreurs potentielles de téléchargement (réseau, etc.)
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")

print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive (corrigé)
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Sera recréé avec stemming
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement (AVEC STEMMING) ---
print("\n[8/9] Définition de la fonction preprocess_text (avec Stemming)...")
# S'assurer que nltk est importé avant d'utiliser ses modules
import nltk
from nltk.corpus import stopwords
# --- Utilisation de PorterStemmer ---
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
# --- Création de l'objet Stemmer ---
stemmer_obj_global = PorterStemmer()
def preprocess_text(text):
    """Applique tokenisation, minuscules, suppression ponctuation/non-alpha, stop words ET STEMMING (Porter)."""
    if not isinstance(text, str): return ""
    try:
        tokens = word_tokenize(text.lower())
    except LookupError as e_tok: # Gestion erreur si ressource NLTK manque
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              resource_name = str(e_tok).split('Resource ')[1].split(' ')[0]
              print(f"--- Tokenizer a besoin de '{resource_name}', tentative téléchargement ---")
              try:
                  nltk.download(resource_name, quiet=True)
                  print(f"--- Ressource '{resource_name}' téléchargée, nouvelle tentative de tokenisation ---")
                  tokens = word_tokenize(text.lower()) # Retenter après téléchargement
              except Exception as e_dl_tok:
                  print(f"--- Échec du téléchargement de '{resource_name}': {e_dl_tok} ---")
                  raise e_tok # Relancer l'erreur originale si le téléchargement échoue
         else:
              raise e_tok # Relancer si ce n'est pas une ressource manquante connue
    except Exception as e_tok_other:
         print(f"Erreur inattendue dans word_tokenize: {e_tok_other}")
         raise e_tok_other
    # --- Application du Stemmer ---
    filtered_tokens = [stemmer_obj_global.stem(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie avec PorterStemmer.")
# Tester la nouvelle fonction
sample_text = "This is an example showing Information Retrieval with stemming and stop words removal."
stemmed_sample = preprocess_text(sample_text)
print(f"  Exemple Stemmed: {stemmed_sample}") # Doit afficher 'exampl show inform retriev stem stop word remov' (stop words supprimés, puis stemming de Porter)

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    print(f"  Parsing des fichiers topics: {topic_files}")
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
# Mettre la création des dictionnaires prétraités dans un try-except
try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés.")
    print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes (avec stemming)...")
    # Appliquer la NOUVELLE fonction preprocess_text (avec stemming)
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries:
     print(f"\nERREUR lors du prétraitement des requêtes: {e_preproc_queries}")
     print("Les dictionnaires prétraités pourraient être incomplets ou vides.")
     # Créer des dictionnaires vides pour éviter NameError plus tard
     queries_short_preprocessed = {}
     queries_long_preprocessed = {}
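
# (Added sketch) Peek at one parsed topic, to illustrate the {'title', 'desc'}
# structure produced by parse_topics and the effect of stemming on a short query.
if all_topics and queries_short_preprocessed:
    example_qid = sorted(all_topics)[0]
    print(f"  Example topic {example_qid}: {all_topics[example_qid]}")
    print(f"  Preprocessed short query: {queries_short_preprocessed.get(example_qid, '')}")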


# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète (avec Stemming) Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.") # Devrait être ~275Mo

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.") # Devrait être ~1051

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise FileNotFoundError
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}") # Devrait être ~240k

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.") # Devrait être ~600Mo
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")



# --- Nouvelle Cellule ---

# === Cellule 2: Indexation Baseline ===
# Crée l'index Lucene à partir de ap_docs.jsonl (sans prétraitement spécifique).

import os
import subprocess
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Début de l'indexation Baseline...")
print(f"Dossier source: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction ('extract_code_tar_gzip_fixed') a échoué.")

# Commande Pyserini
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Nombre de threads pour l'indexation
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options de stockage
]

print(f"Exécution: {' '.join(index_cmd_baseline)}")
try:
    # Exécuter la commande d'indexation
    # Augmenter le timeout car cela peut être long
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:]) # Afficher la fin de stdout
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si 0 document a été indexé (signe de problème)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except Exception as e:
    # Gérer les erreurs potentielles
    print(f"\nERREUR pendant l'indexation Baseline: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index créé
print(f"\nVérification taille index: {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'" # Commande pour taille dossier
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}") # Afficher la taille
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")


# --- Nouvelle Cellule ---

# === Cellule 0: Configuration Complète (avec Stemming) ===
# Installe Java 21, configure comme défaut, installe outils build,
# pybind11, dernière Pyserini, NLTK+ressources, définit chemins,
# FONCTION preprocess_text AVEC STEMMING, parse topics.

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète (avec Stemming) ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try: subprocess.run(install_java_cmd, shell=True, check=True, timeout=180); print("OpenJDK 21 installé.")
except Exception as e: print(f"ERREUR installation Java 21: {e}"); raise

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try: subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True); subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True); print("update-alternatives configuré.")
    except Exception as e: print(f"ERREUR config update-alternatives: {e}")
else: print(f"ATTENTION: Chemin Java 21 non trouvé: {java_path_21}.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]): print(f"ATTENTION: Chemin JAVA_HOME inexistant.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try: subprocess.run(install_build_cmd, shell=True, check=True, timeout=180); print("Outils de build installés.")
except Exception as e_build: print(f"ERREUR installation outils de build: {e_build}")

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q"
try: subprocess.run(install_pybind_cmd, shell=True, check=True, timeout=60); print("pybind11 installé.")
except Exception as e_pybind: print(f"ERREUR installation pybind11: {e_pybind}")

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try: result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600); print("Paquets Python principaux installés.")
except Exception as e_pip: print(f"ERREUR installation pip: {e_pip}"); raise

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab'] # Liste corrigée
for resource in nltk_resources:
    try:
        # 'punkt' et 'punkt_tab' sont sous tokenizers/, les autres ressources sous corpora/
        prefix = 'tokenizers' if resource in ('punkt', 'punkt_tab') else 'corpora'
        resource_path = f'{prefix}/{resource}.zip'
        nltk.data.find(resource_path)
    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try: nltk.download(resource, quiet=True); print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download: print(f"  ERREUR téléchargement '{resource}': {e_download}")
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")
# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

if 'google.colab' in sys.modules:
    try: from google.colab import drive; drive.mount('/content/drive', force_remount=True); print("  Google Drive monté.")
    except Exception as e_mount: print(f"ATTENTION: Erreur montage Drive: {e_mount}")
if not os.path.exists(DRIVE_PROJECT_PATH): raise FileNotFoundError(f"Chemin Drive '{DRIVE_PROJECT_PATH}' inexistant.")
print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar"
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Sera recréé avec stemming
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True); os.makedirs(INDEX_DIR_BASELINE, exist_ok=True); os.makedirs(INDEX_DIR_PREPROC, exist_ok=True);
os.makedirs(CORPUS_DIR, exist_ok=True); os.makedirs(RUN_DIR, exist_ok=True); os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement (AVEC STEMMING) ---
print("\n[8/9] Définition de la fonction preprocess_text (avec Stemming)...")
import nltk
from nltk.corpus import stopwords
# --- Utilisation de PorterStemmer ---
from nltk.stem import PorterStemmer # Import du stemmer
from nltk.tokenize import word_tokenize
import string
stop_words_set_global = set(stopwords.words('english'))
# --- Création de l'objet Stemmer ---
stemmer_obj_global = PorterStemmer() # Création de l'objet
def preprocess_text(text):
    """Applique tokenisation, minuscules, suppression ponctuation/non-alpha, stop words ET STEMMING (Porter)."""
    if not isinstance(text, str): return ""
    try: tokens = word_tokenize(text.lower())
    except LookupError as e_tok: # Gestion erreur si ressource NLTK manque
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              resource_name = str(e_tok).split('Resource ')[1].split(' ')[0]; print(f"--- Tokenizer a besoin de '{resource_name}', tentative téléchargement ---")
              try: nltk.download(resource_name, quiet=True); print(f"--- Ressource '{resource_name}' téléchargée ---"); tokens = word_tokenize(text.lower())
              except Exception as e_dl_tok: print(f"--- Échec téléchargement '{resource_name}': {e_dl_tok} ---"); raise e_tok
         else: raise e_tok
    except Exception as e_tok_other: print(f"Erreur word_tokenize: {e_tok_other}"); raise e_tok_other
    # --- Application du Stemmer ---
    filtered_tokens = [stemmer_obj_global.stem(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie avec PorterStemmer.")
# Tester la nouvelle fonction
sample_text = "This is an example showing Information Retrieval with stemming and stop words removal."
stemmed_sample = preprocess_text(sample_text)
print(f"  Exemple Stemmed: {stemmed_sample}") # Doit afficher 'thi is exampl show inform retriev with stem and stop word remov.'

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
import re
import glob
def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {};
    try:
        with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
        for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
            topic_content = top_match.group(1)
            num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE); topic_id = num_match.group(1).strip() if num_match else None
            title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL); title = title_match.group(1).strip() if title_match else ""
            desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL); desc = desc_match.group(1).strip() if desc_match else ""
            if topic_id and title: topics[topic_id] = {'title': title, 'desc': desc}
    except Exception as e_topic: print(f"  ATTENTION: Erreur parsing {file_path}: {e_topic}")
    return topics
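
# Format de topic TREC attendu par parse_topics (exemple purement illustratif):
# <top>
# <num> Number: 051
# <title> ...
# <desc> Description:
#   ...
# </top>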

if not os.path.exists(TOPICS_DIR):
    print(f"ATTENTION: Dossier topics '{TOPICS_DIR}' inexistant.")
    topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
all_topics = {}
if not topic_files:
    print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    print(f"  Parsing fichiers topics: {topic_files}")
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés."); print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes (avec stemming)...")
    # Appliquer la NOUVELLE fonction preprocess_text (avec stemming)
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries: print(f"\nERREUR prétraitement requêtes: {e_preproc_queries}"); queries_short_preprocessed, queries_long_preprocessed = {}, {}

# --- Vérification Finale Java ---
print("\n--- Vérification Finale Version Java Active ---")
try: result = subprocess.run("java -version", shell=True, check=True, capture_output=True, text=True, timeout=10); print("STDERR:\n", result.stderr); print("\nConfirmation: Java 21 OK." if "21." in result.stderr else "\nATTENTION: Java 21 NON ACTIF ?!")
except Exception as e: print(f"\nERREUR vérification Java: {e}")
# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale Version Pyserini Installée ---")
try: result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30); print(result_pyserini.stdout)
except Exception as e: print(f"ERREUR vérification Pyserini: {e}")

print("\n--- Configuration Complète (avec Stemming) Terminée ---")
print("\nPause..."); time.sleep(2); print("Prêt.")



# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.") # Devrait être ~275Mo

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.") # Devrait être ~1051

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise FileNotFoundError
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}") # Devrait être ~240k

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.") # Devrait être ~600Mo
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")



# --- Nouvelle Cellule ---

# === Cellule 2: Indexation Baseline ===
# Crée l'index Lucene à partir de ap_docs.jsonl (sans prétraitement spécifique).

import os
import subprocess
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Début de l'indexation Baseline...")
print(f"Dossier source: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction a échoué.")

# Commande Pyserini
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Nombre de threads pour l'indexation
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options de stockage
]

print(f"Exécution: {' '.join(index_cmd_baseline)}")
try:
    # Exécuter la commande d'indexation
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:]) # Afficher la fin de stdout
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si 0 document a été indexé (signe de problème)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except Exception as e:
    # Gérer les erreurs potentielles
    print(f"\nERREUR pendant l'indexation Baseline: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index créé
print(f"\nVérification taille index: {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'" # Commande pour taille dossier
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}") # Afficher la taille
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")


# --- Nouvelle Cellule ---

# === Cellule de Vérification du Contenu du Dossier Runs ===
# Utilise les commandes shell de Colab préfixées par '!'

# Chemin exact où les résultats de recherche sont attendus
# (Défini dans la cellule de configuration complète)
RUN_DIR_PATH="/content/ap_output/runs/"

# Afficher le chemin qui va être inspecté
print(f"Vérification du contenu de : {RUN_DIR_PATH}")

# Utiliser '!' pour exécuter la commande shell 'ls -lh'
# Mettre le chemin entre guillemets pour gérer les espaces potentiels (même s'il n'y en a pas ici)
!ls -lh "{RUN_DIR_PATH}"



# --- Nouvelle Cellule ---

# === Monter Google Drive ===
from google.colab import drive
import os

try:
    print("Tentative de montage de Google Drive...")
    drive.mount('/content/drive', force_remount=True) # force_remount=True est utile en cas de problème antérieur

    # Vérifier si le point de montage de base existe après la tentative
    if os.path.exists('/content/drive/My Drive'):
        print("\nGoogle Drive monté avec succès sur /content/drive !")
    else:
        print("\nATTENTION: Le montage semble avoir échoué (vérifiez les messages ci-dessus et la fenêtre d'autorisation).")

except Exception as e:
    print(f"\nUne erreur s'est produite lors du montage de Drive: {e}")



# --- Nouvelle Cellule ---

from multiprocessing import Pool
from google.colab import drive
import os

drive.mount("/content/drive")

# Create the directory if it doesn't exist
target_dir = "/content/drive/MyDrive/Projet_RI"  # Changed 'myDrive' to 'MyDrive'
if not os.path.exists(target_dir):
    try:
        os.makedirs(target_dir, exist_ok=True)  # Use exist_ok to avoid error if directory exists
        print(f"Directory '{target_dir}' created.")
    except FileExistsError:
        print(f"Directory '{target_dir}' already exists.")
else:
    print(f"Directory '{target_dir}' already exists.")

os.chdir(target_dir)

def process_file(file):
    # Votre code de prétraitement ici
    # Example: Assuming you want to read the file and return its content
    file_path = os.path.join("AP_Final", file) # Construct the full file path
    # Specify the encoding when opening the file
    with open(file_path, 'r', encoding='latin-1') as f:  # Try 'latin-1' or 'cp1252'
        preprocessed_text = f.read()  # Assign a value to preprocessed_text
    return preprocessed_text

if __name__ == "__main__":
    files = os.listdir("AP_Final")
    with Pool(os.cpu_count()) as p:  # Utilise tous les cœurs
        results = p.map(process_file, files)

# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (Séquentielle d'abord) ===
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc()) # Affiche la trace complète de l'erreur Java
                 raise # Arrête l'exécution pour ce run si la similarité ne peut être définie
        else:
            print("  Configuration BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
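                # Format de run TREC: "qid Q0 docid rang score run_tag"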
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante

        # Écrire les résultats dans le fichier de run TREC
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        # Important: Fermer le searcher pour libérer les ressources Java, même en cas d'erreur
        if searcher:
             try:
                 # Note: Pyserini ne semble pas avoir de méthode close() explicite sur LuceneSearcher
                 # La JVM devrait se nettoyer, mais c'est une bonne pratique si disponible
                 # searcher.close() # Décommentez si une telle méthode existe dans votre version
                 print(f"  Nettoyage implicite des ressources pour {run_tag}.")
                 pass
             except Exception as e_close:
                 print(f"  Erreur lors de la tentative de fermeture du searcher pour {run_tag}: {e_close}")


# --- Exécution des différentes configurations (en mode séquentiel) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel) sont terminées.")

# --- Note importante ---
# Si cette cellule s'exécute sans planter (même si c'est lent),
# le problème est probablement lié à la parallélisation (mémoire/conflits JVM).
# Si elle plante encore, surtout lors des runs 'tfidf',
# le problème pourrait être lié à ClassicSimilarity ou à l'environnement Java lui-même.


# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (BM25 Séquentiel Uniquement) ===
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
# On importe toujours ClassicSimilarity au cas où, mais on ne l'utilisera pas dans ce test
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    # --- Vérification ajoutée : Ne traiter que BM25 pour ce test ---
    if model != 'bm25':
        print(f"--- Run '{run_tag}' ignoré (Test BM25 uniquement) ---")
        return # Ne rien faire si ce n'est pas BM25

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité (seulement BM25 ici)
        print("  Configuration de BM25...")
        searcher.set_bm25(k1=0.9, b=0.4)
        print("  BM25 configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante

        # Écrire les résultats dans le fichier de run TREC
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        if searcher:
             try:
                 print(f"  Nettoyage implicite des ressources pour {run_tag}.")
                 pass
             except Exception as e_close:
                 print(f"  Erreur lors de la tentative de fermeture du searcher pour {run_tag}: {e_close}")


# --- Exécution des différentes configurations (BM25 seulement) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel, BM25 uniquement) sont terminées.")

# --- Note importante ---
# Si cette cellule s'exécute sans planter, le problème est très probablement lié
# à l'utilisation de ClassicSimilarity (TF-IDF) dans l'environnement Java actuel.
# Si elle plante encore, le problème est plus profond avec l'initialisation de LuceneSearcher.


# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (Séquentielle - BM25 & TF-IDF) ===
# Utilise Pyserini 0.23.0
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4) # Utilise les paramètres BM25 par défaut de Pyserini
            print("  BM25 configuré.")
        elif model == 'tfidf':
            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 # Tentative de configuration de ClassicSimilarity
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc()) # Affiche la trace complète de l'erreur Java
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return # Arrête l'exécution pour ce run spécifique si TF-IDF échoue
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return # Arrête l'exécution pour ce run spécifique
        else:
            # Sécurité : si le modèle n'est ni bm25 ni tfidf, utilise bm25 par défaut
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante même si une échoue

        # Écrire les résultats dans le fichier de run TREC (seulement si aucune erreur majeure n'est survenue avant la boucle)
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25 (peu probable maintenant)
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        # Nettoyage implicite (Pyserini gère la fermeture de la JVM)
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des différentes configurations (Séquentiel - BM25 & TF-IDF) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel - BM25 & TF-IDF tentative) sont terminées.")

# --- Note importante ---
# Surveillez la sortie lors de l'exécution des runs 'tfidf'.
# Si vous voyez des erreurs Java ou si le kernel plante à nouveau,
# cela signifie que ClassicSimilarity est toujours problématique.
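

# --- Nouvelle Cellule ---

# === Esquisse: run RM3 (expansion de requête par pseudo-pertinence) ===
# Le descriptif du projet mentionne RM3 et la cellule d'évaluation sait déjà étiqueter les
# runs "*_rm3"; ceci n'est qu'une esquisse minimale, en supposant que LuceneSearcher.set_rm3()
# est disponible dans la version de Pyserini installée (RM3 requiert un index construit avec
# --storeDocvectors, ce qui est le cas ici). Les paramètres de feedback sont illustratifs.
from pyserini.search.lucene import LuceneSearcher
import os

searcher_rm3 = LuceneSearcher(INDEX_DIR_BASELINE)
searcher_rm3.set_bm25(k1=0.9, b=0.4)
searcher_rm3.set_rm3(fb_terms=10, fb_docs=10, original_query_weight=0.5)  # valeurs illustratives

run_file_rm3 = os.path.join(RUN_DIR, "baseline_short_bm25_rm3.txt")
with open(run_file_rm3, 'w') as f_out:
    for qid, qtext in queries_short.items():
        hits = searcher_rm3.search(qtext, k=K_RESULTS)
        for rank, hit in enumerate(hits, start=1):
            f_out.write(f"{qid} Q0 {hit.docid} {rank} {hit.score:.6f} baseline_short_bm25_rm3\n")
print(f"Run RM3 (esquisse) écrit dans {run_file_rm3}")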


# --- Nouvelle Cellule ---

# === Cellule 0.3: Définir les chemins ===
import os # Assurez-vous que os est importé

# !!! ADAPTEZ CE CHEMIN VERS VOTRE DOSSIER SUR GOOGLE DRIVE SI NÉCESSAIRE !!!
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Projet_RI/TREC/"

# Vérification que le chemin existe
if not os.path.exists(DRIVE_PROJECT_PATH):
    raise FileNotFoundError(f"Le chemin spécifié n'existe pas : {DRIVE_PROJECT_PATH}. Vérifiez le chemin.")

AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar") # Archive .tar contenant des fichiers .gz internes
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence") # Définit QRELS_DIR

# Chemins pour les sorties (index, résultats, etc.) dans l'environnement Colab
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus") # Pour les documents extraits/formatés
RUN_DIR = os.path.join(OUTPUT_DIR, "runs") # Définit RUN_DIR
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval") # Définit EVAL_DIR

# Créer les répertoires de sortie s'ils n'existent pas déjà
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

print(f"Chemin du projet Drive: {DRIVE_PROJECT_PATH}")
print(f"Répertoire de sortie Colab: {OUTPUT_DIR}")
print(f"Chemin Qrels: {QRELS_DIR}") # Vérifie que QRELS_DIR est défini
print(f"Chemin Runs: {RUN_DIR}")
print(f"Chemin Eval: {EVAL_DIR}")


# --- Nouvelle Cellule ---

# === Cellule 4.1 & 4.2: Préparation Qrels et Évaluation des Runs ===
import pandas as pd
import glob
import pytrec_eval
import os # Assurez-vous que os est importé
import traceback # Pour afficher les erreurs détaillées

# --- 4.1: Préparer le Fichier Qrels Combiné ---

# Chemins définis précédemment dans la Cellule 0.3 (qui vient d'être exécutée avec succès)
# QRELS_DIR, RUN_DIR, EVAL_DIR devraient être définis

print(f"Préparation des Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files:
    print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}. Vérifiez le chemin.")
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")

all_qrels_data = []
for qf in qrels_files:
    try:
        # Lire le fichier qrels: query_id unused doc_id relevance
        # S'assurer que les IDs sont lus comme des chaînes de caractères
        qrels_df = pd.read_csv(qf, sep=r'\s+', names=['query_id', 'unused', 'doc_id', 'relevance'],
                               dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
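        # Exemple de ligne qrels TREC attendue (docno illustratif): "51 0 AP880212-0001 1"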
        all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']]) # Garder seulement les colonnes utiles
    except Exception as e:
        print(f"Erreur lors de la lecture du fichier Qrels {qf}: {e}")


if not all_qrels_data:
     print("ERREUR: Impossible de lire les données Qrels. Vérifiez les fichiers et les chemins.")
     # Arrêter ici si les qrels ne peuvent pas être chargés
     raise ValueError("Données Qrels non chargées.")
else:
    combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)

    # Convertir en dictionnaire format pytrec_eval: {query_id: {doc_id: relevance}}
    qrels_dict = {}
    for _, row in combined_qrels_df.iterrows():
        qid = row['query_id']
        did = row['doc_id']
        # Assurer que la pertinence est bien un entier
        try:
            rel = int(row['relevance'])
        except ValueError:
            print(f"Avertissement: Valeur de pertinence non entière ignorée pour qid={qid}, did={did}: {row['relevance']}")
            continue

        # Filtrer les jugements non binaires si nécessaire (garder 0 et 1, ou > 0 pour pertinent)
        if rel < 0: # Ignorer les jugements négatifs si présents
             continue

        if qid not in qrels_dict:
            qrels_dict[qid] = {}
        # Stocker la pertinence (pytrec_eval gère différents niveaux, mais ici 0=non pertinent, >0=pertinent)
        qrels_dict[qid][did] = rel

    print(f"Total de {len(qrels_dict)} requêtes avec jugements dans le fichier Qrels combiné.")
    qrels_doc_count = sum(len(docs) for docs in qrels_dict.values())
    print(f"Nombre total de jugements pertinents/non pertinents chargés: {qrels_doc_count}")


    # --- 4.2: Évaluation des Runs ---

    # Mesures à calculer (standard TREC)
    measures = {'map', 'P_10'} # MAP (mean average precision), Precision at 10

    # Initialiser l'évaluateur avec les qrels et les mesures
    # Utiliser seulement les query_ids présents dans les qrels pour l'évaluation
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures)

    # Trouver tous les fichiers de run générés dans RUN_DIR
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt")))
    print(f"\nFichiers de run à évaluer trouvés dans {RUN_DIR}: {len(run_files)}")
    # print(run_files) # Décommentez pour voir la liste exacte

    results_summary = [] # Pour stocker les résultats pour le tableau final

    if not run_files:
        print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}. Vérifiez que l'étape 3 a bien généré des fichiers.")
    else:
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            print(f"\n--- Évaluation de: {run_name} ---")

            # Charger le fichier de run au format TREC
            # pytrec_eval attend un dictionnaire: {query_id: {doc_id: score}}
            run_dict = {}
            line_count = 0
            error_count = 0
            try:
                with open(run_file, 'r') as f_run:
                    for line in f_run:
                        line_count += 1
                        parts = line.strip().split()
                        if len(parts) != 6:
                             # print(f"Ligne mal formatée ignorée dans {run_name} (ligne {line_count}): {line.strip()}")
                             error_count += 1
                             continue
                        qid, _, did, rank, score, _ = parts
                        # Assurer que l'ID de requête est une chaîne, comme dans qrels_dict
                        qid = str(qid)
                        # Assurer que le score est un float
                        try:
                            score = float(score)
                        except ValueError:
                            # print(f"Score non flottant ignoré dans {run_name} (ligne {line_count}): {score}")
                            error_count += 1
                            continue

                        if qid not in run_dict:
                            run_dict[qid] = {}
                        run_dict[qid][did] = score

                if error_count > 0:
                    print(f"  Avertissement: {error_count} lignes mal formatées ignorées dans {run_name}.")

                # Filtrer le run_dict pour ne garder que les query_ids présents dans qrels_dict
                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_queries = len(run_dict) - len(filtered_run_dict)
                if ignored_queries > 0:
                    print(f"  Avertissement: {ignored_queries} requêtes du run ignorées car absentes des Qrels.")

                if not filtered_run_dict:
                     print("  Erreur: Aucune requête du run ne correspond aux Qrels. Impossible d'évaluer.")
                     continue

                # Effectuer l'évaluation sur les données filtrées
                eval_results = evaluator.evaluate(filtered_run_dict)

                # Calculer les moyennes sur toutes les requêtes évaluées
                # Gérer le cas où une métrique pourrait manquer pour une requête (peu probable avec MAP, P@10)
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]

                # Éviter la division par zéro si aucune requête n'a pu être évaluée
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")
                print("-" * (20 + len(run_name)))

                # Extraire les informations pour le tableau récapitulatif
                # Gère les noms de fichiers comme 'baseline_short_bm25.txt' ou 'preproc_long_tfidf.txt'
                parts = run_name.replace('.txt','').split('_')
                if len(parts) >= 3:
                    index_type = parts[0] # baseline ou preproc
                    query_type = parts[1] # short ou long
                    model_type = parts[2] # bm25 ou tfidf
                    # Gérer le cas RM3 si on l'ajoute plus tard
                    if len(parts) > 3 and parts[3] == 'rm3':
                         model_type += "+RM3"

                    results_summary.append({
                        "Run Name": run_name,
                        "Index": index_type,
                        "Query Type": query_type.capitalize(), # Met la première lettre en majuscule
                        "Weighting Scheme": model_type.upper(), # Met en majuscules (BM25, TFIDF)
                        "MAP": avg_map,
                        "P@10": avg_p10
                    })
                else:
                     print(f"  Avertissement: Impossible de parser le nom du run '{run_name}' pour le résumé.")

            except FileNotFoundError:
                 print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e:
                 print(f"  Erreur lors de l'évaluation de {run_name}: {e}")
                 print(traceback.format_exc())

        # Afficher le tableau récapitulatif si des résultats ont été collectés
        if results_summary:
            print("\n\n=== Tableau Récapitulatif des Résultats (Partie 1) ===")
            results_df = pd.DataFrame(results_summary)

            # Pivoter pour obtenir le format demandé (plus ou moins)
            try:
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP (Moyenne des Précisions Moyennes) ---")
                print(pivot_map.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot_map:
                 print(f"\nErreur lors de la création du tableau pivot MAP: {e_pivot_map}")
                 print("Affichage du DataFrame brut MAP:")
                 print(results_df[['Query Type', 'Weighting Scheme', 'Index', 'MAP']].to_markdown(index=False, floatfmt=".4f"))


            try:
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 (Précision aux 10 premiers documents) ---")
                print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot_p10:
                 print(f"\nErreur lors de la création du tableau pivot P@10: {e_pivot_p10}")
                 print("Affichage du DataFrame brut P@10:")
                 print(results_df[['Query Type', 'Weighting Scheme', 'Index', 'P@10']].to_markdown(index=False, floatfmt=".4f"))


            # Sauvegarder le DataFrame pour une utilisation ultérieure (ex: rapport)
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_part1.csv")
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif sauvegardé dans {summary_file_path}")
            except Exception as e_save:
                 print(f"\nErreur lors de la sauvegarde du résumé dans {summary_file_path}: {e_save}")

        else:
            print("\nAucun résultat d'évaluation à afficher ou sauvegarder.")



# --- Nouvelle Cellule ---

# === Cellule de Vérification Java (à exécuter JUSTE AVANT la Cellule 5.1 / rm3_run_code) ===
# Ceci vérifie quelle version de Java le kernel Python voit ACTUELLEMENT
print("--- Vérification de la version Java vue par le kernel ACTUEL ---")
!java -version
print("-------------------------------------------------------------")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Chemins Corrigés) ===
import os
import sys
import subprocess
import time

print("--- Début de la Configuration Complète ---")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
subprocess.run(install_java_cmd, shell=True, check=True)
print("OpenJDK 21 installé.")

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
    subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
    print("update-alternatives configuré pour java.")
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11"
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # 'punkt' est un tokenizer, les autres ressources sont des corpora
        prefix = 'tokenizers' if resource == 'punkt' else 'corpora'
        nltk.data.find(f'{prefix}/{resource}.zip')
    except LookupError:
        # nltk.data.find lève LookupError (et non DownloadError) si la ressource est absente
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! CHEMIN CORRIGÉ SELON VOS INDICATIONS !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # Chemin vers le sous-dossier TREC

# --- Le reste du code vérifie le chemin et définit les autres variables ---
if not os.path.exists(DRIVE_PROJECT_PATH):
    try:
        from google.colab import drive
        print("  Montage de Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        if not os.path.exists(DRIVE_PROJECT_PATH):
             raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe TOUJOURS PAS après montage. Vérifiez le chemin exact et le nom des dossiers.")
    except ModuleNotFoundError:
         raise FileNotFoundError(f"Google Colab Drive non trouvé et chemin '{DRIVE_PROJECT_PATH}' inexistant.")
    except Exception as e_mount:
         raise FileNotFoundError(f"Erreur lors du montage de Drive ou chemin '{DRIVE_PROJECT_PATH}' inexistant: {e_mount}")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

# !!! NOM DE FICHIER CORRIGÉ SELON VOS INDICATIONS !!!
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar") # Utilise AP.tar au lieu de AP.tar.gz
# Note: Pensez à modifier la Cellule 0.4 (extraction) pour ouvrir avec "r:" au lieu de "r:gz"
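# Rappel indicatif des modes tarfile correspondants :
#   tarfile.open(AP_TAR_PATH, "r:")    # archive .tar non compressée
#   tarfile.open(AP_TAR_PATH, "r:gz")  # archive .tar.gz compressée (gzip)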

TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")


# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
import glob
import re
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics
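# Format de topic TREC attendu par parse_topics (exemple illustratif, valeurs hypothétiques) :
# <top>
# <num> Number: 051
# <title> Airbus Subsidies
# <desc> Description:
#   Un paragraphe décrivant le besoin d'information...
# </top>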
topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")


# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR:\n", result.stderr) # Version souvent sur stderr
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")


# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")


print("\n--- Configuration Complète Terminée ---")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Chemins Corrigés) ===
# ... (début de la cellule inchangé : installation Java, build tools, pip, etc.) ...

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! VÉRIFIEZ CE CHEMIN VERS LE DOSSIER CONTENANT AP.tar !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # Est-ce le bon dossier ?

# --- Le reste du code vérifie le chemin et définit les autres variables ---
if not os.path.exists(DRIVE_PROJECT_PATH):
    try:
        from google.colab import drive
        print("  Montage de Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        if not os.path.exists(DRIVE_PROJECT_PATH):
             raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe TOUJOURS PAS après montage. Vérifiez le chemin exact et le nom des dossiers.")
    except ModuleNotFoundError:
         raise FileNotFoundError(f"Google Colab Drive non trouvé et chemin '{DRIVE_PROJECT_PATH}' inexistant.")
    except Exception as e_mount:
         raise FileNotFoundError(f"Erreur lors du montage de Drive ou chemin '{DRIVE_PROJECT_PATH}' inexistant: {e_mount}")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

# !!! VÉRIFIEZ CE NOM DE FICHIER EXACT !!!
AP_TAR_FILENAME = "AP.tar" # Est-ce bien 'AP.tar' ? Ou 'ap.tar' ? Autre ?
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
# Note: Pensez à modifier la Cellule 0.4 (extraction) pour ouvrir avec "r:" au lieu de "r:gz"

TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes") # Ces sous-dossiers existent-ils ?
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence") # Ces sous-dossiers existent-ils ?
OUTPUT_DIR = "/content/ap_output"
# ... (définition des autres chemins inchangée) ...
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}") # Affiche le chemin complet qui sera vérifié
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# ... (reste de la cellule inchangé : définition preprocess_text, parsing topics, vérifications finales) ...



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire et Formater les Documents depuis AP.tar ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression
import os # Assurer que os est importé
import traceback # Pour afficher les erreurs

# Chemins définis dans la cellule précédente (combined_setup_paths_fixed)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
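# Chaque ligne écrite suivra le schéma JsonCollection attendu par Pyserini,
# par exemple (identifiant indicatif) :
# {"id": "AP880212-0001", "contents": "texte du document nettoyé ..."}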

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe (devrait être bon maintenant, mais double vérification)
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé. Vérifiez le chemin et le nom du fichier dans la cellule de configuration.")

# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0

# Ouvrir/créer le fichier JSONL de sortie
# Utiliser le mode "r:" pour un fichier .tar non compressé
try:
    # Utiliser encoding='utf-8' pour l'écriture du fichier JSONL
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"  {len(members)} membres trouvés dans l'archive TAR.")
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile():
                skipped_members += 1
                continue

            file_read_count += 1
            # Extraire le contenu du fichier
            try:
                f = tar.extractfile(member)
                if f: # S'assurer que l'extraction a réussi
                    # Lire et décoder avec gestion des erreurs
                    content = f.read().decode('utf-8', errors='ignore')
                    f.close()

                    # Trouver tous les documents (<DOC>...</DOC>) dans le fichier actuel
                    for doc_match in doc_pattern.finditer(content):
                        doc_content = doc_match.group(1)

                        # Extraire DOCNO
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        # Extraire TEXT
                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split()) # Nettoyage espaces
                        else:
                            doc_text = ""

                        # Écrire l'entrée JSONL
                        try:
                            json_line = json.dumps({"id": doc_id, "contents": doc_text})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                # Peut arriver si le membre est listé mais inaccessible
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Est-il corrompu ou n'est-ce pas un fichier TAR valide? Erreur: {e_tar}")
    raise e_tar # Arrêter si le TAR est illisible
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé au moment de l'ouverture.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement du fichier TAR: {e_general}")
     traceback.print_exc()
     raise e_general


print(f"\nTraitement terminé.")
print(f"  {file_read_count} fichiers lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés (dossiers ou erreurs).")
print(f"  {doc_count} documents formatés et écrits dans {JSONL_OUTPUT_PATH}")
if doc_count < 100000: # Seuil arbitraire pour AP
     print("  ATTENTION: Le nombre de documents extraits semble faible. Vérifiez le fichier TAR et les regex.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
print(f"Collection source: {JSONL_OUTPUT_PATH}")
print(f"Répertoire de l'index: {INDEX_DIR_BASELINE}")

# Commande Pyserini pour l'indexation
# -input: dossier contenant les fichiers JSONL (ici CORPUS_DIR)
# -collection: type de collection (JsonCollection pour nos fichiers .jsonl)
# -generator: comment traiter les fichiers (DefaultLuceneDocumentGenerator crée un document Lucene par ligne JSON)
# -index: chemin où sauvegarder l'index
# -threads: nombre de threads à utiliser (ajustez si besoin, 4 est raisonnable pour Colab)
# -storePositions -storeDocvectors -storeRaw: stocke informations supplémentaires utiles
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw"
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    # Vous pouvez décider de lever l'erreur pour arrêter ou juste afficher un message
    # raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    # raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    import traceback
    traceback.print_exc()
    # raise e
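
# Vérification rapide (exemple indicatif) : statistiques de l'index baseline via l'API Pyserini.
# Suppose que l'indexation ci-dessus s'est terminée sans erreur.
try:
    from pyserini.index.lucene import IndexReader
    reader = IndexReader(INDEX_DIR_BASELINE)
    print("Statistiques de l'index baseline:", reader.stats())
except Exception as e_stats:
    print(f"Vérification de l'index impossible: {e_stats}")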



# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
import json
from tqdm.notebook import tqdm
import os
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
# CORPUS_DIR

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# S'assurer que la fonction preprocess_text est définie (normalement fait dans la cellule de setup)
if 'preprocess_text' not in globals():
    print("Erreur: La fonction 'preprocess_text' n'est pas définie. Ré-exécutez la cellule de configuration.")
    # Optionnel: Redéfinir ici si nécessaire (copier depuis la cellule de setup)
    raise NameError("preprocess_text non définie")
else:
    doc_count_preproc = 0
    error_count = 0
    # Lire le fichier JSONL original et écrire le fichier prétraité
    try:
        # Utiliser utf-8 pour lire et écrire
        with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
             open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:

            # Itérer sur le fichier d'entrée
            # Utiliser tqdm pour la barre de progression
            for line in tqdm(infile, desc="Prétraitement des documents"):
                try:
                    data = json.loads(line)
                    # Utiliser .get pour la robustesse si 'id' ou 'contents' manque
                    doc_id = data.get('id', None)
                    original_contents = data.get('contents', '')

                    if doc_id is None:
                        # print("Avertissement: Ligne JSON sans 'id', ignorée.")
                        error_count += 1
                        continue

                    # Appliquer le prétraitement
                    preprocessed_contents = preprocess_text(original_contents)

                    # Écrire la nouvelle ligne JSONL
                    # S'assurer que l'ID est une chaîne et le contenu aussi
                    json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                    outfile.write(json_line + '\n')
                    doc_count_preproc += 1

                except json.JSONDecodeError:
                    # print(f"Avertissement: Erreur de décodage JSON sur une ligne, ignorée.")
                    error_count += 1
                except Exception as e_line:
                    print(f"\nErreur inattendue lors du prétraitement d'une ligne (id={data.get('id', 'inconnu')}): {e_line}")
                    error_count += 1
                    # Optionnel: Afficher la trace pour débugger des erreurs spécifiques
                    # traceback.print_exc()


        print(f"\nTerminé.")
        print(f"  {doc_count_preproc} documents prétraités et écrits dans {JSONL_PREPROC_PATH}")
        if error_count > 0:
             print(f"  {error_count} lignes ignorées à cause d'erreurs.")

    except FileNotFoundError:
        print(f"ERREUR: Le fichier d'entrée {JSONL_OUTPUT_PATH} n'a pas été trouvé.")
        raise
    except Exception as e_main:
        print(f"ERREUR générale lors de la préparation des données prétraitées: {e_main}")
        traceback.print_exc()
        raise



# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source
# INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Dossier cible pour l'index
# CORPUS_DIR contient le fichier JSONL prétraité

print(f"Début de l'indexation avec Prétraitement...")
# Note: Pyserini s'attend à un dossier en entrée pour JsonCollection et indexera
# TOUS les fichiers .jsonl présents dans CORPUS_DIR (y compris ap_docs.jsonl).
# Pour n'indexer que la version prétraitée, placer ap_docs_preprocessed.jsonl
# dans un sous-dossier dédié et passer ce sous-dossier à --input.
print(f"Collection source (dossier): {CORPUS_DIR}")
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe
if not os.path.exists(JSONL_PREPROC_PATH):
    raise FileNotFoundError(f"Le fichier de données prétraitées {JSONL_PREPROC_PATH} n'a pas été trouvé. Assurez-vous que l'étape précédente (1.3) s'est bien terminée.")

# Commande Pyserini pour l'indexation prétraitée
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR, # Pointeur vers le dossier contenant les jsonl
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Important: Indique que le texte est déjà tokenisé/traité
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_preproc)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    print(f"\nIndexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Prétraitée a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Prétraitée a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Prétraitée: {e}")
    traceback.print_exc()
    raise e



# --- Nouvelle Cellule ---

# === Cellule 3.1: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Utilise la dernière Pyserini et Java 21
# Assurez-vous que les variables d'index et de requêtes sont définies par la cellule de config
# INDEX_DIR_BASELINE, INDEX_DIR_PREPROC
# queries_short, queries_long, queries_short_preprocessed, queries_long_preprocessed
# K_RESULTS devrait aussi être défini (sinon, on le mettra à 1000)

from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées
import os # Assurer que os est importé
from jnius import autoclass, JavaException # Importer pour TF-IDF

# Essayer de définir K_RESULTS si ce n'est pas déjà fait
try:
    K_RESULTS
except NameError:
    print("Définition de K_RESULTS (nombre de résultats) à 1000...")
    K_RESULTS = 1000

# --- Configuration des modèles de similarité ---
# Charger la classe Java pour TF-IDF (ClassicSimilarity)
# Mettre dans un try-except au cas où l'import échouerait encore (peu probable avec Java 21)
try:
    ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
    print("Classe ClassicSimilarity (pour TF-IDF) chargée avec succès.")
except JavaException as e_load_class:
    print(f"ERREUR Java lors du chargement de ClassicSimilarity: {e_load_class}")
    print("Les recherches TF-IDF échoueront probablement.")
    ClassicSimilarity = None # Mettre à None pour pouvoir vérifier plus tard
except Exception as e_load_gen:
     print(f"ERREUR inattendue lors du chargement de ClassicSimilarity: {e_load_gen}")
     ClassicSimilarity = None


def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        # Assurer que LuceneSearcher est importé
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None:
                 print("ERREUR: Classe ClassicSimilarity non chargée. Impossible de configurer TF-IDF.")
                 print(f"--- ABANDON du run {run_tag} ---")
                 return # Ne pas continuer si la classe n'a pas pu être chargée

            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return
        else:
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # S'assurer que preprocess_text est défini
                if 'preprocess_text' not in globals():
                     raise NameError("La fonction preprocess_text n'est pas définie.")

                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
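                # Format de ligne de run TREC : <qid> Q0 <docid> <rang> <score> <run_tag>
                # (exemple indicatif : "051 Q0 AP880212-0001 1 12.345678 baseline_short_bm25")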
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                # Compter les erreurs par requête mais continuer
                query_errors += 1
                if query_errors < 10: # Limiter l'affichage des erreurs par requête
                     print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                elif query_errors == 10:
                     print("\nPlusieurs erreurs de recherche pour ce run, messages suivants masqués...")


        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
             with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes de résultats écrites.")
        else:
            print("\n  Avertissement: Aucun résultat généré pour ce run.")

        if query_errors > 0:
            print(f"  Avertissement: {query_errors} erreurs rencontrées lors de la recherche sur les requêtes individuelles.")

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc())
    finally:
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des 8 configurations de recherche (Séquentiel) ---

print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Tout-en-un) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Essayer de trouver la ressource pour éviter le re-téléchargement inutile
        prefix = 'tokenizers' if resource == 'punkt' else 'corpora'
        nltk.data.find(f'{prefix}/{resource}.zip')
        # print(f"  Ressource NLTK '{resource}' déjà présente.")
    except LookupError:
        # nltk.data.find lève LookupError (et non DownloadError) si la ressource est absente
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
stop_words_set = set(stopwords.words('english'))
lemmatizer_obj = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire et Formater les Documents depuis AP.tar ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression
import os # Assurer que os est importé
import traceback # Pour afficher les erreurs

# Chemins définis dans la cellule précédente (full_setup_code)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe (devrait être bon maintenant)
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé. Vérifiez le chemin et le nom du fichier dans la cellule de configuration.")

# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0

# Ouvrir/créer le fichier JSONL de sortie
# Utiliser le mode "r:" pour un fichier .tar non compressé
try:
    # Utiliser encoding='utf-8' pour l'écriture du fichier JSONL
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"  {len(members)} membres trouvés dans l'archive TAR.")
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile():
                skipped_members += 1
                continue

            file_read_count += 1
            # Extraire le contenu du fichier
            try:
                f = tar.extractfile(member)
                if f: # S'assurer que l'extraction a réussi
                    # Lire et décoder avec gestion des erreurs
                    content = f.read().decode('utf-8', errors='ignore')
                    f.close()

                    # Trouver tous les documents (<DOC>...</DOC>) dans le fichier actuel
                    for doc_match in doc_pattern.finditer(content):
                        doc_content = doc_match.group(1)

                        # Extraire DOCNO
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        # Extraire TEXT
                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split()) # Nettoyage espaces
                        else:
                            doc_text = ""

                        # Écrire l'entrée JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)}) # Assurer str
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                # Peut arriver si le membre est listé mais inaccessible
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Est-il corrompu ou n'est-ce pas un fichier TAR valide? Erreur: {e_tar}")
    raise e_tar # Arrêter si le TAR est illisible
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé au moment de l'ouverture.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement du fichier TAR: {e_general}")
     traceback.print_exc()
     raise e_general


print(f"\nTraitement terminé.")
print(f"  {file_read_count} fichiers lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés (dossiers ou erreurs).")
print(f"  {doc_count} documents formatés et écrits dans {JSONL_OUTPUT_PATH}")
if doc_count < 100000: # Seuil arbitraire pour AP
     print("  ATTENTION: Le nombre de documents extraits semble faible. Vérifiez le fichier TAR et les regex.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e



# --- Nouvelle Cellule ---

# === Cellule de Vérification et Nettoyage du Corpus ===
import os
import subprocess

print("--- Vérification du contenu du dossier Corpus ---")

# Redéfinir CORPUS_DIR au cas où (normalement défini dans la config)
OUTPUT_DIR = "/content/ap_output"
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")

# Vérifier si le dossier CORPUS_DIR existe
if not os.path.exists(CORPUS_DIR):
    print(f"ERREUR: Le dossier {CORPUS_DIR} n'existe pas. L'étape d'extraction a peut-être échoué.")
else:
    print(f"Contenu du dossier : {CORPUS_DIR}")
    # Utiliser !ls pour lister le contenu
    !ls -lh {CORPUS_DIR}
    print("-" * 30)

    print("\n--- Vérification du format de ap_docs.jsonl ---")
    jsonl_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
    if not os.path.exists(jsonl_path):
        print(f"ERREUR: Le fichier {jsonl_path} n'existe pas. L'étape d'extraction a échoué.")
    else:
        print(f"Affichage des 3 premières lignes de : {jsonl_path}")
        # Utiliser !head pour afficher les premières lignes
        !head -n 3 {jsonl_path}
        print("-" * 30)

    print("\n--- Vérification et Nettoyage potentiel ---")
    preproc_jsonl_path = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")
    if os.path.exists(preproc_jsonl_path):
        print(f"Le fichier prétraité {preproc_jsonl_path} existe.")
        print("Il va être supprimé pour éviter les interférences avec l'indexation baseline.")
        try:
            # Supprimer le fichier via subprocess (équivalent de !rm)
            rm_cmd = f"rm '{preproc_jsonl_path}'" # Mettre des guillemets au cas où il y aurait des espaces
            print(f"Exécution de : {rm_cmd}")
            subprocess.run(rm_cmd, shell=True, check=True, capture_output=True, text=True)
            print(f"Fichier {preproc_jsonl_path} supprimé avec succès.")
            # Vérifier à nouveau le contenu du dossier
            print("\nNouveau contenu du dossier :")
            !ls -lh {CORPUS_DIR}
        except subprocess.CalledProcessError as e:
            print(f"ERREUR lors de la suppression de {preproc_jsonl_path}: {e}")
            print("Sortie STDERR:", e.stderr)
        except Exception as e:
            print(f"ERREUR inattendue lors de la suppression: {e}")
    else:
        print(f"Le fichier prétraité {preproc_jsonl_path} n'existe pas. Aucun nettoyage nécessaire.")
    print("-" * 30)

print("\n--- Vérification et Nettoyage Terminés ---")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire et Formater les Documents depuis AP.tar (Avec Debug) ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression
import os # Assurer que os est importé
import traceback # Pour afficher les erreurs

# Chemins définis précédemment
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")
print("--- AJOUT DE DEBUG ---")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé. Vérifiez le chemin et le nom du fichier.")
else:
    # Afficher la taille du fichier TAR
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")
    if tar_size < 1024 * 1024: # Moins de 1 Mo, suspect pour AP
        print("  ATTENTION: La taille du fichier TAR semble très petite !")


# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
docs_found_in_files = 0
first_doc_id_found = None
first_doc_text_sample = None

# Ouvrir/créer le fichier JSONL de sortie
# Utiliser le mode "r:" pour un fichier .tar non compressé
try:
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\nDEBUG: {len(members)} membres trouvés dans l'archive TAR.")
        if not members:
             print("ATTENTION: Aucun membre trouvé dans l'archive TAR. Le fichier est peut-être vide ou corrompu.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            if not member.isfile():
                skipped_members += 1
                # print(f"DEBUG: Membre ignoré (pas un fichier): {member.name}")
                continue

            file_read_count += 1
            if file_read_count % 50 == 0: # Afficher un message tous les 50 fichiers lus
                 print(f"DEBUG: Lecture du fichier {file_read_count}/{len(members)}: {member.name}")

            try:
                f = tar.extractfile(member)
                if f:
                    content = f.read().decode('utf-8', errors='ignore')
                    f.close()

                    # DEBUG: Vérifier si des balises <DOC> sont trouvées
                    doc_matches = doc_pattern.findall(content)
                    num_docs_in_file = len(doc_matches)
                    if num_docs_in_file > 0:
                        docs_found_in_files += 1
                        # print(f"DEBUG: Trouvé {num_docs_in_file} <DOC> dans {member.name}")
                    # elif file_read_count <= 10: # Afficher pour les 10 premiers fichiers si aucun doc trouvé
                         # print(f"DEBUG: Trouvé 0 <DOC> dans {member.name}")


                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        # DEBUG: Sauvegarder le premier ID et extrait de texte trouvés
                        if first_doc_id_found is None:
                            first_doc_id_found = doc_id
                            first_doc_text_sample = doc_text[:100] + "..." # Extrait

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Est-il corrompu ou n'est-ce pas un fichier TAR valide? Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé au moment de l'ouverture.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement du fichier TAR: {e_general}")
     traceback.print_exc()
     raise e_general


print(f"\n--- Fin de l'Extraction (Avec Debug) ---")
print(f"  {file_read_count} fichiers lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés.")
print(f"  {docs_found_in_files} fichiers contenaient au moins une balise <DOC>.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")
if first_doc_id_found:
    print(f"  Premier Doc ID trouvé: {first_doc_id_found}")
    print(f"  Extrait du premier texte: {first_doc_text_sample}")
else:
    print("  Aucun document avec ID et Texte n'a été trouvé/extrait.")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait ! Vérifiez les regex ou la structure interne des fichiers dans AP.tar. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire, Décompresser et Formater les Documents ===
import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Chemins définis précédemment
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex (inchangées)
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

# Ouvrir/créer le fichier JSONL de sortie
try:
    # Utiliser encoding='utf-8' pour l'écriture
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            if not member.isfile() or not member.name.endswith(('.gz', '.Z')): # Traiter seulement les fichiers .gz ou .Z
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Initialiser content

            try:
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # --- AJOUT : Décompression Gzip ---
                    try:
                        # Décompresser le contenu lu
                        content_bytes = gzip.decompress(compressed_content)
                        # Décoder en texte APRES décompression
                        content = content_bytes.decode('utf-8', errors='ignore')
                    except gzip.BadGzipFile:
                        # print(f"Avertissement: Fichier {member.name} n'est pas un fichier gzip valide, tentative de lecture directe.")
                        # Essayer de décoder directement si ce n'était pas du gzip (moins probable vu les noms)
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au fichier suivant si la décompression échoue
                    # --- FIN AJOUT ---

                    # Chercher les documents dans le contenu décompressé et décodé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches:
                         # Si aucun <DOC> trouvé, passer au membre suivant
                         continue

                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement: {e_general}")
     traceback.print_exc()
     raise e_general

print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés (pas .gz/.Z ou erreur lecture).")
if decompression_errors > 0:
    print(f"  {decompression_errors} erreurs de décompression rencontrées.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait même après tentative de décompression ! Vérifiez les regex ou la structure interne des fichiers décompressés. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")
    elif output_size > 0 and doc_count > 0:
         print("  SUCCÈS: Le fichier de sortie contient des données.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (maintenant non vide)
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction a échoué.")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé malgré un fichier source non vide. Problème potentiel.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")




# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
import json
from tqdm.notebook import tqdm
import os
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (non vide)
# CORPUS_DIR

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# S'assurer que la fonction preprocess_text est définie (normalement fait dans la cellule de setup)
if 'preprocess_text' not in globals():
    print("Erreur: La fonction 'preprocess_text' n'est pas définie. Ré-exécutez la cellule de configuration.")
    raise NameError("preprocess_text non définie")
else:
    doc_count_preproc = 0
    error_count = 0
    # Lire le fichier JSONL original et écrire le fichier prétraité
    try:
        # Utiliser utf-8 pour lire et écrire
        with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
             open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:

            # Itérer sur le fichier d'entrée
            # Utiliser tqdm pour la barre de progression
            for line in tqdm(infile, desc="Prétraitement des documents"):
                try:
                    data = json.loads(line)
                    # Utiliser .get pour la robustesse si 'id' ou 'contents' manque
                    doc_id = data.get('id', None)
                    original_contents = data.get('contents', '')

                    if doc_id is None:
                        error_count += 1
                        continue

                    # Appliquer le prétraitement
                    preprocessed_contents = preprocess_text(original_contents)

                    # Écrire la nouvelle ligne JSONL
                    json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                    outfile.write(json_line + '\n')
                    doc_count_preproc += 1

                except json.JSONDecodeError:
                    error_count += 1
                except Exception as e_line:
                    print(f"\nErreur inattendue lors du prétraitement d'une ligne (id={data.get('id', 'inconnu')}): {e_line}")
                    error_count += 1

        print(f"\nTerminé.")
        print(f"  {doc_count_preproc} documents prétraités et écrits dans {JSONL_PREPROC_PATH}")
        if error_count > 0:
             print(f"  {error_count} lignes ignorées à cause d'erreurs.")

        # Vérifier la taille du fichier de sortie
        if os.path.exists(JSONL_PREPROC_PATH):
            output_size = os.path.getsize(JSONL_PREPROC_PATH)
            print(f"  Taille finale de {JSONL_PREPROC_PATH}: {output_size} octets.")
            if output_size == 0 and doc_count_preproc > 0:
                 print("  ATTENTION: 0 octet écrit malgré le traitement de documents. Problème ?")
        else:
            print(f"  ATTENTION: Le fichier de sortie {JSONL_PREPROC_PATH} n'a pas été créé.")


    except FileNotFoundError:
        print(f"ERREUR: Le fichier d'entrée {JSONL_OUTPUT_PATH} n'a pas été trouvé.")
        raise
    except Exception as e_main:
        print(f"ERREUR générale lors de la préparation des données prétraitées: {e_main}")
        traceback.print_exc()
        raise



# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source
# INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Dossier cible pour l'index
# CORPUS_DIR contient le fichier JSONL prétraité

print(f"Début de l'indexation avec Prétraitement...")
# Note: Pyserini s'attend à un dossier en entrée pour JsonCollection,
# il trouvera ap_docs_preprocessed.jsonl dans CORPUS_DIR.
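# ATTENTION (ajout): JsonCollection indexe TOUS les fichiers présents dans le dossier d'entrée.
# Si ap_docs.jsonl (non prétraité) est encore dans CORPUS_DIR, il sera indexé en plus du fichier
# prétraité; idéalement, placer ap_docs_preprocessed.jsonl dans un sous-dossier dédié.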
print(f"Collection source (dossier): {CORPUS_DIR}")
JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Chemin complet pour vérification
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe et n'est pas vide
if not os.path.exists(JSONL_PREPROC_PATH) or os.path.getsize(JSONL_PREPROC_PATH) == 0:
    raise FileNotFoundError(f"Le fichier de données prétraitées {JSONL_PREPROC_PATH} est manquant ou vide. L'étape précédente (1.3) a échoué.")

# Commande Pyserini pour l'indexation prétraitée
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR, # Pointeur vers le dossier contenant les jsonl
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Important: Indique que le texte est déjà tokenisé/traité
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_preproc)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé. Problème potentiel avec l'indexation prétraitée.")
    else:
        print(f"\nIndexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Prétraitée a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Prétraitée a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Prétraitée: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_PREPROC}...")
if os.path.exists(INDEX_DIR_PREPROC):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_PREPROC}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Cellule 3.1: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Utilise la dernière Pyserini et Java 21
# Assurez-vous que les variables d'index et de requêtes sont définies par la cellule de config
# INDEX_DIR_BASELINE, INDEX_DIR_PREPROC
# queries_short, queries_long, queries_short_preprocessed, queries_long_preprocessed
# K_RESULTS devrait aussi être défini (sinon, on le mettra à 1000)

from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées
import os # Assurer que os est importé
from jnius import autoclass, JavaException # Importer pour TF-IDF

# Essayer de définir K_RESULTS si ce n'est pas déjà fait
try:
    K_RESULTS
except NameError:
    print("Définition de K_RESULTS (nombre de résultats) à 1000...")
    K_RESULTS = 1000

# --- Configuration des modèles de similarité ---
# Charger la classe Java pour TF-IDF (ClassicSimilarity)
# Mettre dans un try-except au cas où l'import échouerait (peu probable avec Java 21)
try:
    ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
    print("Classe ClassicSimilarity (pour TF-IDF) chargée avec succès.")
except JavaException as e_load_class:
    print(f"ERREUR Java lors du chargement de ClassicSimilarity: {e_load_class}")
    print("Les recherches TF-IDF échoueront probablement.")
    ClassicSimilarity = None # Mettre à None pour pouvoir vérifier plus tard
except Exception as e_load_gen:
     print(f"ERREUR inattendue lors du chargement de ClassicSimilarity: {e_load_gen}")
     ClassicSimilarity = None


def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        # Assurer que LuceneSearcher est importé
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None:
                 print("ERREUR: Classe ClassicSimilarity non chargée. Impossible de configurer TF-IDF.")
                 print(f"--- ABANDON du run {run_tag} ---")
                 return # Ne pas continuer si la classe n'a pas pu être chargée

            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return
        else:
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # S'assurer que preprocess_text est défini
                if 'preprocess_text' not in globals():
                     raise NameError("La fonction preprocess_text n'est pas définie.")

                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                # Vérifier si la requête traitée est vide
                if not search_text.strip():
                     # print(f"  Avertissement: Requête QID {query_id} est vide après traitement, ignorée.")
                     continue # Ignorer les requêtes vides

                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    # S'assurer que doc_id n'est pas None (peut arriver dans de rares cas)
                    if doc_id is None:
                        # print(f"  Avertissement: Doc ID est None pour QID {query_id} au rang {rank}, ignoré.")
                        continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                # Compter les erreurs par requête mais continuer
                query_errors += 1
                if query_errors < 10: # Limiter l'affichage des erreurs par requête
                     print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                elif query_errors == 10:
                     print("\nPlusieurs erreurs de recherche pour ce run, messages suivants masqués...")


        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
             # Utiliser encoding='utf-8' pour l'écriture
             with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes de résultats écrites.")
        else:
            print("\n  Avertissement: Aucun résultat généré pour ce run.")

        if query_errors > 0:
            print(f"  Avertissement: {query_errors} erreurs rencontrées lors de la recherche sur les requêtes individuelles.")

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc())
    finally:
        # En théorie, Pyserini/jnius gère la fermeture de la JVM, pas besoin de fermer le searcher explicitement
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des 8 configurations de recherche (Séquentiel) ---

print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Tout-en-un) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Essayer de trouver la ressource pour éviter le re-téléchargement inutile
        # (nltk.data.find lève LookupError si la ressource est absente)
        if resource == 'punkt':
            nltk.data.find(f'tokenizers/{resource}.zip')
        else:
            nltk.data.find(f'corpora/{resource}.zip')
        # print(f"  Ressource NLTK '{resource}' déjà présente.")
    except LookupError:
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")


# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire, Décompresser et Formater les Documents ===
import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Chemins définis dans la cellule précédente (full_setup_code)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex (inchangées)
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

# Ouvrir/créer le fichier JSONL de sortie
try:
    # Utiliser encoding='utf-8' pour l'écriture
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Traiter seulement les fichiers se terminant par .gz ou .Z (typique pour TREC)
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Initialiser content

            try:
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # --- AJOUT : Décompression Gzip ---
                    try:
                        # Décompresser le contenu lu
                        content_bytes = gzip.decompress(compressed_content)
                        # Décoder en texte APRES décompression
                        content = content_bytes.decode('utf-8', errors='ignore')
                    except gzip.BadGzipFile:
                        # print(f"Avertissement: Fichier {member.name} n'est pas un fichier gzip valide, tentative de lecture directe.")
                        # Essayer de décoder directement si ce n'était pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au fichier suivant si la décompression échoue
                    # --- FIN AJOUT ---

                    # Chercher les documents dans le contenu décompressé et décodé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches:
                         # Si aucun <DOC> trouvé, passer au membre suivant
                         continue

                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement: {e_general}")
     traceback.print_exc()
     raise e_general

print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0:
    print(f"  {decompression_errors} erreurs ou avertissements de décompression rencontrés.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait ! Vérifiez les regex ou la structure interne des fichiers décompressés. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")
    elif output_size > 0 and doc_count > 0:
         print("  SUCCÈS: Le fichier de sortie contient des données.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (maintenant non vide)
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction a échoué.")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé malgré un fichier source non vide. Problème potentiel.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Sauvegarde des fichiers générés vers Google Drive ===
import os
import subprocess

# Redéfinir le chemin de base sur Drive (adaptez si nécessaire)
# Assurez-vous que ce chemin pointe vers le dossier où vous voulez sauvegarder,
# par exemple, le dossier Projet_RI
# DRIVE_SAVE_BASE_PATH = "/content/drive/My Drive/Projet_RI" # Exemple
# Ou utiliser le chemin du projet TREC si vous voulez sauvegarder dedans
DRIVE_SAVE_BASE_PATH = DRIVE_PROJECT_PATH # Sauvegarde dans le dossier TREC

# Chemin source dans Colab
SOURCE_DIR = "/content/ap_output"

# Chemin cible sur Google Drive
# Crée un sous-dossier 'colab_output_backup' pour ne pas mélanger
# avec vos fichiers originaux.
TARGET_DIR_ON_DRIVE = os.path.join(DRIVE_SAVE_BASE_PATH, "colab_output_backup")

print(f"Source à copier : {SOURCE_DIR}")
print(f"Cible sur Drive : {TARGET_DIR_ON_DRIVE}")

# Vérifier si le dossier source existe
if os.path.exists(SOURCE_DIR):
    # Créer le dossier cible sur Drive s'il n'existe pas
    os.makedirs(TARGET_DIR_ON_DRIVE, exist_ok=True)
    print("\nCopie des fichiers en cours... (Cela peut prendre quelques minutes)")
    # Utiliser cp -r (récursif) et -v (verbeux)
    copy_cmd = f"cp -r -v '{SOURCE_DIR}/.' '{TARGET_DIR_ON_DRIVE}/'" # Copie le contenu de SOURCE_DIR
    try:
        # Utiliser subprocess pour récupérer la sortie et le code de retour
        # (ou simplement !cp -r ... dans une cellule Colab)
        process = subprocess.Popen(copy_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            print("\nCopie terminée avec succès !")
            print(f"Les fichiers de {SOURCE_DIR} ont été copiés dans {TARGET_DIR_ON_DRIVE}")
        else:
            print(f"\nERREUR lors de la copie. Code de retour: {process.returncode}")
            print("STDOUT:", stdout.decode())
            print("STDERR:", stderr.decode())
    except Exception as e:
        print(f"\nERREUR inattendue lors de la copie: {e}")
else:
    print(f"Le dossier source {SOURCE_DIR} n'existe pas, aucune copie effectuée.")



# --- Nouvelle Cellule ---

# === Restauration des fichiers depuis Google Drive ===
import os
import subprocess
import time

# Chemin où les fichiers ont été sauvegardés sur Drive
# (Doit correspondre au TARGET_DIR_ON_DRIVE de la cellule save_output_code)
# Assurez-vous que DRIVE_PROJECT_PATH est défini par la cellule de setup précédente
try:
    DRIVE_PROJECT_PATH
except NameError:
    print("ERREUR: La variable DRIVE_PROJECT_PATH n'est pas définie. Exécutez d'abord la cellule de configuration complète.")
    # Optionnel: Redéfinir ici si nécessaire, mais il vaut mieux exécuter la cellule de setup
    # DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC"
    raise

DRIVE_BACKUP_DIR = os.path.join(DRIVE_PROJECT_PATH, "colab_output_backup")

# Chemin cible dans Colab (où Pyserini s'attend à les trouver)
TARGET_RESTORE_DIR = "/content/ap_output"

print(f"Source sur Drive : {DRIVE_BACKUP_DIR}")
print(f"Cible dans Colab : {TARGET_RESTORE_DIR}")

# Vérifier si le dossier de sauvegarde existe sur Drive
if os.path.exists(DRIVE_BACKUP_DIR):
    # Créer le dossier cible dans Colab s'il n'existe pas
    # (La cellule de setup l'a peut-être déjà créé, mais `exist_ok=True` gère cela)
    os.makedirs(TARGET_RESTORE_DIR, exist_ok=True)

    print("\nRestauration des fichiers en cours... (Cela peut prendre quelques minutes)")
    # Utiliser cp -r (récursif) et -v (verbeux)
    # Copie le contenu de DRIVE_BACKUP_DIR dans TARGET_RESTORE_DIR
    # L'option -T peut être utile si TARGET_RESTORE_DIR existe déjà pour éviter de créer un sous-dossier
    # Mais copier le contenu avec '/.' est généralement plus sûr.
    copy_cmd = f"cp -r -v '{DRIVE_BACKUP_DIR}/.' '{TARGET_RESTORE_DIR}/'"
    try:
        # Exécuter et attendre la fin
        process = subprocess.run(copy_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600) # Timeout 10 minutes
        # Afficher stdout/stderr peut être très long, afficher seulement si erreur?
        # print("STDOUT:", process.stdout)
        # print("STDERR:", process.stderr)
        print("\nRestauration terminée avec succès !")
        print(f"Les fichiers de {DRIVE_BACKUP_DIR} ont été copiés dans {TARGET_RESTORE_DIR}")
        # Vérifier le contenu restauré
        print("\nContenu du dossier restauré (partiel):")
        !ls -lR {TARGET_RESTORE_DIR} | head -n 20 # Afficher une partie du contenu
    except subprocess.CalledProcessError as e:
        print(f"\nERREUR lors de la restauration. Code de retour: {e.returncode}")
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)
        print("\nVérifiez que le dossier de sauvegarde existe et contient les bons fichiers/dossiers (corpus, indexes/baseline).")
        raise e
    except subprocess.TimeoutExpired as e:
        print(f"\nERREUR: La restauration a dépassé le délai d'attente.")
        raise e
    except Exception as e:
        print(f"\nERREUR inattendue lors de la restauration: {e}")
        raise e
else:
    print(f"ERREUR: Le dossier de sauvegarde {DRIVE_BACKUP_DIR} n'existe pas sur Google Drive.")
    print("Impossible de restaurer les fichiers. Vous devrez relancer les étapes d'extraction et d'indexation baseline.")
    # Optionnel: lever une exception pour arrêter
    # raise FileNotFoundError(f"Dossier de sauvegarde non trouvé: {DRIVE_BACKUP_DIR}")
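
# Vérification optionnelle après restauration (esquisse) : s'assurer que les éléments
# attendus par les cellules suivantes (corpus et index baseline) sont bien présents.
expected_paths = [
    os.path.join(TARGET_RESTORE_DIR, "corpus", "ap_docs.jsonl"),
    os.path.join(TARGET_RESTORE_DIR, "indexes", "baseline"),
]
for p in expected_paths:
    statut = "OK" if os.path.exists(p) else "MANQUANT"
    print(f"  [{statut}] {p}")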



# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Pour Reprendre) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Essayer de trouver la ressource pour éviter le re-téléchargement inutile
        if resource == 'punkt':
            nltk.data.find(f'tokenizers/{resource}.zip')
        elif resource == 'omw-1.4':
             nltk.data.find(f'corpora/{resource}.zip')
        else:
            nltk.data.find(f'corpora/{resource}.zip')
        # print(f"  Ressource NLTK '{resource}' déjà présente.")
    # NB: nltk.data.find lève LookupError (et non DownloadError) quand une ressource est absente,
    # donc cette clause ne capture pas le cas "ressource manquante" (corrigé dans la cellule suivante).
    except nltk.downloader.DownloadError:
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics
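
# Petit test rapide (esquisse) de parse_topics sur un topic TREC synthétique écrit dans un
# fichier temporaire, pour illustrer le format attendu (<top>, <num>, <title>, <desc>).
# Le contenu du topic est purement illustratif.
_exemple_topic = (
    "<top>\n"
    "<num> Number: 051\n"
    "<title> Airbus Subsidies\n"
    "<desc> Description:\n"
    "Document will discuss government assistance to Airbus Industrie.\n"
    "</top>\n"
)
_demo_topic_path = "/tmp/demo_topic.txt"
with open(_demo_topic_path, "w", encoding="utf-8") as _f:
    _f.write(_exemple_topic)
print(f"  Exemple parse_topics (topic synthétique): {parse_topics(_demo_topic_path)}")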

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Correction NLTK) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
# S'assurer que nltk est importé
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Déterminer le chemin de recherche correct pour nltk.data.find
        if resource == 'punkt':
            resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4':
             resource_path = f'corpora/{resource}.zip' # Open Multilingual Wordnet
        elif resource == 'wordnet':
             resource_path = f'corpora/{resource}.zip'
        else: # stopwords, etc.
            resource_path = f'corpora/{resource}.zip'

        # Essayer de trouver la ressource
        nltk.data.find(resource_path)
        # print(f"  Ressource NLTK '{resource}' déjà présente.")

    # --- CORRECTION ICI: Utiliser except LookupError ---
    except LookupError:
    # --------------------------------------------------
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            # Capturer les erreurs potentielles de téléchargement (réseau, etc.)
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")
            # Optionnel: arrêter si une ressource critique manque
            # if resource in ['punkt', 'stopwords', 'wordnet']: raise

print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
# (Déjà fait plus haut, mais redondance sans danger)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Correction NLTK punkt_tab) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
# S'assurer que nltk est importé
import nltk
# --- CORRECTION ICI: Ajout de 'punkt_tab' ---
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab']
# ---------------------------------------------
for resource in nltk_resources:
    try:
        # Déterminer le chemin de recherche correct pour nltk.data.find
        if resource == 'punkt' or resource == 'punkt_tab': # punkt_tab est aussi dans tokenizers
            resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4':
             resource_path = f'corpora/{resource}.zip' # Open Multilingual Wordnet
        elif resource == 'wordnet':
             resource_path = f'corpora/{resource}.zip'
        else: # stopwords, etc.
            resource_path = f'corpora/{resource}.zip'

        # Essayer de trouver la ressource
        nltk.data.find(resource_path)
        # print(f"  Ressource NLTK '{resource}' déjà présente.")

    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            # Capturer les erreurs potentielles de téléchargement (réseau, etc.)
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")
            # Optionnel: arrêter si une ressource critique manque
            # if resource in ['punkt', 'stopwords', 'wordnet']: raise

print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
# (Déjà fait plus haut, mais redondance sans danger)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    # Mettre la tokenisation dans un try-except spécifique pour voir si c'est elle qui échoue
    try:
        tokens = word_tokenize(text.lower())
    except LookupError as e_tok:
         # Essayer de télécharger la ressource manquante si c'est une LookupError NLTK
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              # Retirer d'éventuels codes couleur ANSI du message d'erreur NLTK avant d'extraire le nom de la ressource
              resource_name = re.sub(r'\x1b\[[0-9;]*m', '', str(e_tok)).split('Resource ')[1].split(' ')[0]
              print(f"--- Tokenizer a besoin de '{resource_name}', tentative de téléchargement ---")
              try:
                  nltk.download(resource_name, quiet=True)
                  print(f"--- Ressource '{resource_name}' téléchargée, nouvelle tentative de tokenisation ---")
                  tokens = word_tokenize(text.lower()) # Retenter après téléchargement
              except Exception as e_dl_tok:
                  print(f"--- Échec du téléchargement de '{resource_name}': {e_dl_tok} ---")
                  raise e_tok # Relancer l'erreur originale si le téléchargement échoue
         else:
              raise e_tok # Relancer si ce n'est pas une ressource manquante connue
    except Exception as e_tok_other:
         print(f"Erreur inattendue dans word_tokenize: {e_tok_other}")
         raise e_tok_other

    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
# Mettre la création des dictionnaires prétraités dans un try-except au cas où preprocess_text échouerait encore
try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés.")
    print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes...")
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries:
     print(f"\nERREUR lors du prétraitement des requêtes: {e_preproc_queries}")
     print("Les dictionnaires prétraités pourraient être incomplets ou vides.")
     # Créer des dictionnaires vides pour éviter NameError plus tard
     queries_short_preprocessed = {}
     queries_long_preprocessed = {}


# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire, Décompresser et Formater les Documents ===
import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Chemins définis dans la cellule précédente (full_setup_code_punkt_tab_fixed)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex (inchangées)
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)
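
# Mini-test optionnel (esquisse) des regex sur un document AP synthétique, pour illustrer
# la structure attendue (<DOC>, <DOCNO>, <TEXT>). Le contenu est purement illustratif.
_doc_demo = "<DOC>\n<DOCNO> AP880212-0001 </DOCNO>\n<TEXT>\nSample AP text.\n</TEXT>\n</DOC>"
for _d in doc_pattern.findall(_doc_demo):
    print("  DOCNO:", docno_pattern.search(_d).group(1).strip(),
          "| TEXT:", text_pattern.search(_d).group(1).strip())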

doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

# Ouvrir/créer le fichier JSONL de sortie
try:
    # Utiliser encoding='utf-8' pour l'écriture
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Traiter seulement les fichiers se terminant par .gz ou .Z (typique pour TREC)
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Initialiser content

            try:
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # --- AJOUT : Décompression Gzip ---
                    try:
                        # Décompresser le contenu lu
                        content_bytes = gzip.decompress(compressed_content)
                        # Décoder en texte APRES décompression
                        content = content_bytes.decode('utf-8', errors='ignore')
                    except gzip.BadGzipFile:
                        # print(f"Avertissement: Fichier {member.name} n'est pas un fichier gzip valide, tentative de lecture directe.")
                        # Essayer de décoder directement si ce n'était pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au fichier suivant si la décompression échoue
                    # --- FIN AJOUT ---

                    # Chercher les documents dans le contenu décompressé et décodé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches:
                         # Si aucun <DOC> trouvé, passer au membre suivant
                         continue

                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
    print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé.")
    raise # Relancer l'exception originale (avec son message) plutôt qu'une FileNotFoundError vide
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement: {e_general}")
     traceback.print_exc()
     raise e_general

print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0:
    print(f"  {decompression_errors} erreurs ou avertissements de décompression rencontrés.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait ! Vérifiez les regex ou la structure interne des fichiers décompressés. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")
    elif output_size > 0 and doc_count > 0:
         print("  SUCCÈS: Le fichier de sortie contient des données.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (maintenant non vide)
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

# S'assurer que les variables sont définies (au cas où)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Les variables CORPUS_DIR ou INDEX_DIR_BASELINE ne sont pas définies. Ré-exécutez la cellule de configuration.")
    # Optionnel: redéfinir ici, mais moins propre
    # OUTPUT_DIR = "/content/ap_output"
    # CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
    # INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
    raise

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction ('extract_code_tar_gzip_fixed') a peut-être échoué ou n'a pas été exécutée.")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé malgré un fichier source non vide. Problème potentiel.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
import json
from tqdm.notebook import tqdm
import os
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (non vide)
# CORPUS_DIR

# S'assurer que les variables sont définies
try:
    CORPUS_DIR
    JSONL_OUTPUT_PATH
except NameError:
    print("ERREUR: Les variables CORPUS_DIR ou JSONL_OUTPUT_PATH ne sont pas définies. Ré-exécutez la cellule de configuration.")
    raise

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# S'assurer que la fonction preprocess_text est définie (normalement fait dans la cellule de setup)
if 'preprocess_text' not in globals():
    print("Erreur: La fonction 'preprocess_text' n'est pas définie. Ré-exécutez la cellule de configuration.")
    raise NameError("preprocess_text non définie")
else:
    doc_count_preproc = 0
    error_count = 0
    # Lire le fichier JSONL original et écrire le fichier prétraité
    try:
        # Utiliser utf-8 pour lire et écrire
        with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
             open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:

            # Itérer sur le fichier d'entrée
            # Utiliser tqdm pour la barre de progression
            for line in tqdm(infile, desc="Prétraitement des documents"):
                try:
                    data = json.loads(line)
                    # Utiliser .get pour la robustesse si 'id' ou 'contents' manque
                    doc_id = data.get('id', None)
                    original_contents = data.get('contents', '')

                    if doc_id is None:
                        error_count += 1
                        continue

                    # Appliquer le prétraitement
                    preprocessed_contents = preprocess_text(original_contents)

                    # Écrire la nouvelle ligne JSONL
                    json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                    outfile.write(json_line + '\n')
                    doc_count_preproc += 1

                except json.JSONDecodeError:
                    # print(f"Avertissement: Erreur de décodage JSON sur une ligne, ignorée.")
                    error_count += 1
                except Exception as e_line:
                    print(f"\nErreur inattendue lors du prétraitement d'une ligne (id={data.get('id', 'inconnu')}): {e_line}")
                    error_count += 1

        print(f"\nTerminé.")
        print(f"  {doc_count_preproc} documents prétraités et écrits dans {JSONL_PREPROC_PATH}")
        if error_count > 0:
             print(f"  {error_count} lignes ignorées à cause d'erreurs.")

        # Vérifier la taille du fichier de sortie
        if os.path.exists(JSONL_PREPROC_PATH):
            output_size = os.path.getsize(JSONL_PREPROC_PATH)
            print(f"  Taille finale de {JSONL_PREPROC_PATH}: {output_size} octets.")
            if output_size == 0 and doc_count_preproc > 0:
                 print("  ATTENTION: 0 octet écrit malgré le traitement de documents. Problème ?")
        else:
            print(f"  ATTENTION: Le fichier de sortie {JSONL_PREPROC_PATH} n'a pas été créé.")


    except FileNotFoundError:
        print(f"ERREUR: Le fichier d'entrée {JSONL_OUTPUT_PATH} n'a pas été trouvé.")
        raise
    except Exception as e_main:
        print(f"ERREUR générale lors de la préparation des données prétraitées: {e_main}")
        traceback.print_exc()
        raise


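# Optional sanity check (a minimal sketch, not part of the original pipeline):
# peek at the first preprocessed line to confirm the {"id", "contents"} layout
# that the indexing step below relies on. Output values depend on your data.
if os.path.exists(JSONL_PREPROC_PATH) and os.path.getsize(JSONL_PREPROC_PATH) > 0:
    with open(JSONL_PREPROC_PATH, 'r', encoding='utf-8') as f_check:
        first_doc = json.loads(f_check.readline())
    print("Sample preprocessed document:")
    print("  id      :", first_doc.get("id"))
    print("  contents:", str(first_doc.get("contents", ""))[:80], "...")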

# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source
# INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Dossier cible pour l'index
# CORPUS_DIR contient le fichier JSONL prétraité

# S'assurer que les variables sont définies
try:
    CORPUS_DIR
    INDEX_DIR_PREPROC
except NameError:
    print("ERREUR: Les variables CORPUS_DIR ou INDEX_DIR_PREPROC ne sont pas définies. Ré-exécutez la cellule de configuration.")
    # Optionnel: redéfinir ici, mais moins propre
    # OUTPUT_DIR = "/content/ap_output"
    # CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
    # INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
    raise

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Chemin complet pour vérification

print(f"Début de l'indexation avec Prétraitement...")
# Note: Pyserini s'attend à un dossier en entrée pour JsonCollection,
# il trouvera ap_docs_preprocessed.jsonl dans CORPUS_DIR.
print(f"Collection source (dossier): {CORPUS_DIR}")
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe et n'est pas vide
if not os.path.exists(JSONL_PREPROC_PATH) or os.path.getsize(JSONL_PREPROC_PATH) == 0:
    raise FileNotFoundError(f"Le fichier de données prétraitées {JSONL_PREPROC_PATH} est manquant ou vide. Assurez-vous que l'étape précédente (1.3) s'est bien terminée.")
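# Hedged note: --input below points at CORPUS_DIR, and JsonCollection indexes
# every JSONL file it finds there, so the non-preprocessed ap_docs.jsonl would
# be picked up alongside the preprocessed file. The check below only reports
# what the indexer will see; isolating ap_docs_preprocessed.jsonl in its own
# folder is one way to avoid indexing both, if that is not intended.
import glob
jsonl_files_in_corpus = glob.glob(os.path.join(CORPUS_DIR, "*.jsonl"))
print(f"JSONL files visible to the indexer: {[os.path.basename(p) for p in jsonl_files_in_corpus]}")
if len(jsonl_files_in_corpus) > 1:
    print("  WARNING: more than one .jsonl file will be indexed together.")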

# Commande Pyserini pour l'indexation prétraitée
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR, # Pointeur vers le dossier contenant les jsonl
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Important: Indique que le texte est déjà tokenisé/traité
]
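# Hedged note on --pretokenized: it tells Anserini/Pyserini to treat "contents"
# as already analyzed (tokens split on whitespace), so Lucene does not apply
# its own stemming/stopword analysis on top of preprocess_text().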

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_preproc)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé. Problème potentiel avec l'indexation prétraitée.")
    else:
        print(f"\nIndexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Prétraitée a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Prétraitée a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Prétraitée: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_PREPROC}...")
if os.path.exists(INDEX_DIR_PREPROC):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_PREPROC}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Cellule 3.1: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Utilise la dernière Pyserini et Java 21 (devraient être actifs)
# S'assurer que les variables d'index et de requêtes sont définies

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées
import os # Assurer que os est importé
from jnius import autoclass, JavaException # Importer pour TF-IDF

# Essayer de définir K_RESULTS si ce n'est pas déjà fait
try:
    K_RESULTS
except NameError:
    print("Définition de K_RESULTS (nombre de résultats) à 1000...")
    K_RESULTS = 1000

# --- Configuration des modèles de similarité ---
# Charger la classe Java pour TF-IDF (ClassicSimilarity)
# Mettre dans un try-except au cas où l'import échouerait (peu probable maintenant)
try:
    ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
    print("Classe ClassicSimilarity (pour TF-IDF) chargée avec succès.")
except JavaException as e_load_class:
    print(f"ERREUR Java lors du chargement de ClassicSimilarity: {e_load_class}")
    print("Les recherches TF-IDF échoueront probablement.")
    ClassicSimilarity = None # Mettre à None pour pouvoir vérifier plus tard
except Exception as e_load_gen:
     print(f"ERREUR inattendue lors du chargement de ClassicSimilarity: {e_load_gen}")
     ClassicSimilarity = None
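# For reference: ClassicSimilarity is Lucene's legacy TF-IDF (vector space)
# scoring, which is why it serves as the "tfidf" weighting scheme here.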

# Vérifier que les variables nécessaires existent
try:
    INDEX_DIR_BASELINE
    INDEX_DIR_PREPROC
    RUN_DIR
    queries_short
    queries_long
    queries_short_preprocessed
    queries_long_preprocessed
    preprocess_text # Vérifier aussi la fonction
except NameError as e_missing_var:
    print(f"ERREUR: Variable essentielle manquante ({e_missing_var}). L'environnement a peut-être été perdu. Ré-exécutez la cellule de configuration complète.")
    raise e_missing_var


def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        # Assurer que LuceneSearcher est importé
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
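            # For reference: k1 controls term-frequency saturation and b the
            # strength of document-length normalization; 0.9/0.4 are the usual
            # Anserini defaults for TREC-style runs.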
            print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None:
                 print("ERREUR: Classe ClassicSimilarity non chargée. Impossible de configurer TF-IDF.")
                 print(f"--- ABANDON du run {run_tag} ---")
                 return # Ne pas continuer si la classe n'a pas pu être chargée

            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return
        else:
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # S'assurer que preprocess_text est défini
                if 'preprocess_text' not in globals():
                     raise NameError("La fonction preprocess_text n'est pas définie.")

                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                # Vérifier si la requête traitée est vide
                if not search_text.strip():
                     # print(f"  Avertissement: Requête QID {query_id} est vide après traitement, ignorée.")
                     continue # Ignorer les requêtes vides

                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    # S'assurer que doc_id n'est pas None (peut arriver dans de rares cas)
                    if doc_id is None:
                        # print(f"  Avertissement: Doc ID est None pour QID {query_id} au rang {rank}, ignoré.")
                        continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                # Compter les erreurs par requête mais continuer
                query_errors += 1
                if query_errors < 10: # Limiter l'affichage des erreurs par requête
                     print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                elif query_errors == 10:
                     print("\nPlusieurs erreurs de recherche pour ce run, messages suivants masqués...")


        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
             # Utiliser encoding='utf-8' pour l'écriture
             with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes de résultats écrites.")
        else:
            print("\n  Avertissement: Aucun résultat généré pour ce run.")

        if query_errors > 0:
            print(f"  Avertissement: {query_errors} erreurs rencontrées lors de la recherche sur les requêtes individuelles.")

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc())
    finally:
        # En théorie, Pyserini/jnius gère la fermeture de la JVM, pas besoin de fermer le searcher explicitement
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des 8 configurations de recherche (Séquentiel) ---

print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")


# --- Nouvelle Cellule ---

# === Cellule 7: Exécuter la Recherche Améliorée (RM3) ===
# Applique RM3 sur la meilleure configuration de base identifiée à l'étape 6.
# !! N'OUBLIEZ PAS DE CONFIGURER LES VARIABLES BEST_... CI-DESSOUS !!

from pyserini.search.lucene import LuceneSearcher
from jnius import autoclass, JavaException
from tqdm.notebook import tqdm
import time
import traceback
import os

# Recharger ClassicSimilarity au cas où
try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
except Exception: ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; EVAL_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise

# --- À CONFIGURER selon vos meilleurs résultats de l'Étape 6 ---
# !! MODIFIEZ CECI EN FONCTION DE VOS RÉSULTATS D'ÉVALUATION !!
print("--- Configuration RM3 ---")
print("Veuillez éditer les variables BEST_... ci-dessous en fonction de vos meilleurs résultats MAP de l'étape précédente.")
# Exemple: si preproc + long + bm25 était le meilleur
BEST_INDEX_PATH = INDEX_DIR_PREPROC           # Ex: INDEX_DIR_BASELINE ou INDEX_DIR_PREPROC
BEST_QUERIES = queries_long_preprocessed      # Ex: queries_short, queries_long, ..._preprocessed
BEST_MODEL_BASE = 'bm25'                      # Ex: 'bm25' ou 'tfidf'
BEST_RUN_TAG_PREFIX = "preproc_long"          # Ex: 'baseline_short', 'preproc_long'
USE_PREPROC_QUERY_FOR_RM3 = False             # Généralement False si BEST_QUERIES est déjà prétraité
# ----------------------------------------------------------------
print(f"Configuration choisie pour RM3:")
print(f"  Index: {os.path.basename(BEST_INDEX_PATH)}")
# print(f"  Requêtes: (variable BEST_QUERIES)") # Difficile d'afficher le nom de la variable
print(f"  Modèle Base: {BEST_MODEL_BASE}")
print(f"  Préfixe Tag: {BEST_RUN_TAG_PREFIX}")
print(f"  Utiliser Preproc Requête?: {USE_PREPROC_QUERY_FOR_RM3}")

# Nom du fichier et tag pour le run RM3
PRF_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3.txt")
RM3_RUN_TAG = f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3"

# Paramètres RM3
rm3_config = {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5}
print(f"  Paramètres RM3: {rm3_config}")

# --- Fonction de recherche RM3 (séquentielle) ---
# (Définition identique à celle de search_code_final, on peut la réutiliser si elle est dans la portée)
# Par sécurité, on la redéfinit ici au cas où l'utilisateur n'exécute que cette cellule après setup.
def perform_search_sequential_rm3(queries, index_path, model_base, k, output_run_file, run_tag, use_preprocessed_query=False, rm3_params=None):
    """Exécute la recherche RM3 séquentiellement."""
    start_time = time.time()
    print(f"\nDébut recherche SÉQUENTIELLE RM3: Modèle='{model_base}+RM3', Tag='{run_tag}', k={k}")
    all_results_list = []
    searcher = None
    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        if model_base == 'bm25': print("  Config BM25 (base)..."); searcher.set_bm25(k1=0.9, b=0.4)
        elif model_base == 'tfidf':
            if ClassicSimilarity is None: raise ValueError("ClassicSimilarity non chargée.")
            print("  Config ClassicSimilarity (base)...")
            try: searcher.set_similarity(ClassicSimilarity())
            except Exception as e_sim: print(f"ERREUR config ClassicSimilarity: {e_sim}"); return
        else: print(f"Modèle base '{model_base}' non reconnu, utilise BM25."); searcher.set_bm25()
        print("  Activation RM3..."); searcher.set_rm3(**rm3_params); print("  RM3 activé.")
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche RM3 QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche RM3...")
        if all_results_list:
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites.")
        else: print("\n  Avertissement: Aucun résultat RM3 généré.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")
        end_time = time.time()
        print(f"Recherche RM3 terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run RM3 {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# Lancer la recherche RM3 (après configuration des variables BEST_...)
print("\nLancement de la recherche RM3...")
perform_search_sequential_rm3(
    BEST_QUERIES, BEST_INDEX_PATH, BEST_MODEL_BASE, K_RESULTS,
    PRF_RUN_FILE, RM3_RUN_TAG,
    use_preprocessed_query=USE_PREPROC_QUERY_FOR_RM3, rm3_params=rm3_config
)

print("\n--- Exécution de la recherche RM3 terminée. ---")


# --- Nouvelle Cellule ---

# === Cellule 5: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Lance les 8 combinaisons de recherche et sauvegarde les résultats.
# Assurez-vous que l'environnement Java 21 est toujours actif.
# Assurez-vous que les index existent et que les variables sont définies.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import autoclass, JavaException # Pour TF-IDF

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Charger ClassicSimilarity
try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity'); print("ClassicSimilarity chargée.")
except Exception as e: print(f"ERREUR chargement ClassicSimilarity: {e}"); ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    # Vérifier aussi l'existence des index
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed manquant: {INDEX_DIR_PREPROC}")
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        if model == 'bm25': print("  Config BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None: print("ERREUR: ClassicSimilarity non chargée. ABANDON."); return
            print("  Config ClassicSimilarity (TF-IDF)...")
            try: searcher.set_similarity(ClassicSimilarity()); print("  ClassicSimilarity configurée.")
            except Exception as e_sim: print(f"ERREUR config ClassicSimilarity: {e_sim}"); return
        else: print(f"Modèle '{model}' non reconnu, utilise BM25."); searcher.set_bm25()

        query_errors = 0
        # S'assurer que preprocess_text est défini avant la boucle
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             # Créer le dossier RUN_DIR si besoin (normalement fait par setup)
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations ---
print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt"); perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt"); perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt"); perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt"); perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}


# --- Nouvelle Cellule ---

OUTPUT_DIR = "/content/ap_output"
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")

# --- Nouvelle Cellule ---

%%bash
# Chemin exact où les résultats de recherche sont attendus
RUN_DIR_PATH="/content/ap_output/runs/"

echo "Vérification du contenu de : ${RUN_DIR_PATH}"
ls -l "${RUN_DIR_PATH}"


# --- Nouvelle Cellule ---

# === Cellule de Vérification du Contenu du Dossier Runs ===
# Utilise les commandes shell de Colab préfixées par '!'

# Chemin exact où les résultats de recherche sont attendus
# (Défini dans la cellule de configuration complète)
RUN_DIR_PATH="/content/ap_output/runs/"

# Utiliser '!' pour exécuter la commande shell 'echo'
print(f"Vérification du contenu de : {RUN_DIR_PATH}")

# Utiliser '!' pour exécuter la commande shell 'ls -l'
# Mettre le chemin entre guillemets pour gérer les espaces potentiels (même s'il n'y en a pas ici)
!ls -lh "{RUN_DIR_PATH}"


# --- Nouvelle Cellule ---

# === Cellule 2: Restauration des fichiers depuis Google Drive (Tout Inclus) ===
import os
import subprocess
import time

# Chemin où les fichiers ont été sauvegardés sur Drive
try: DRIVE_PROJECT_PATH # Défini dans la cellule précédente
except NameError: print("ERREUR: DRIVE_PROJECT_PATH non défini. Exécutez config complète."); raise

DRIVE_BACKUP_DIR = os.path.join(DRIVE_PROJECT_PATH, "colab_output_backup")

# Chemin cible dans Colab
TARGET_RESTORE_DIR = "/content/ap_output" # = OUTPUT_DIR défini précédemment

print(f"Source sur Drive : {DRIVE_BACKUP_DIR}")
print(f"Cible dans Colab : {TARGET_RESTORE_DIR}")

# Vérifier si le dossier de sauvegarde existe
if os.path.exists(DRIVE_BACKUP_DIR):
    os.makedirs(TARGET_RESTORE_DIR, exist_ok=True) # Créer dossier cible si besoin

    print("\nRestauration des fichiers (corpus et index) en cours... (Peut prendre plusieurs minutes)")
    # Commande de copie récursive
    copy_cmd = f"cp -r -v '{DRIVE_BACKUP_DIR}/.' '{TARGET_RESTORE_DIR}/'"
    try:
        process = subprocess.run(copy_cmd, shell=True, check=True, capture_output=True, text=True, timeout=900) # Timeout 15 minutes pour les index
        print("\nRestauration terminée avec succès !")
        print(f"Les fichiers de {DRIVE_BACKUP_DIR} ont été copiés dans {TARGET_RESTORE_DIR}")
        # Vérifier le contenu restauré (y compris les index)
        print("\nContenu du dossier restauré (partiel):")
        !ls -l {TARGET_RESTORE_DIR}
        print("\nContenu du dossier indexes (restauré):")
        !ls -l {TARGET_RESTORE_DIR}/indexes
    except subprocess.CalledProcessError as e:
         print(f"\nERREUR restauration (code {e.returncode}). Vérifiez si backup existe et contient corpus/, indexes/baseline/, indexes/preprocessed/.")
         print("STDERR:", e.stderr); raise e
    except Exception as e: print(f"\nERREUR restauration: {e}"); raise e
else:
    print(f"ERREUR: Dossier sauvegarde {DRIVE_BACKUP_DIR} inexistant.")
    print("Impossible de restaurer. Il faut relancer extraction et indexations.")
    raise FileNotFoundError(f"Dossier sauvegarde non trouvé: {DRIVE_BACKUP_DIR}")



# --- Nouvelle Cellule ---

# === Cellule 4: Exécuter les Recherches (Séquentielles - BM25 & QLD) ===
# Lance les 8 combinaisons de recherche en utilisant BM25 et QLD.
# S'assure que l'environnement Java 21 est actif et que les index/variables sont définis/restaurés.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import JavaException # Importer seulement JavaException, ClassicSimilarity n'est pas utilisé

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index restaurés
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline restauré manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed restauré manquant: {INDEX_DIR_PREPROC}")
    # Vérifier aussi que les fichiers de corpus sont là (restaurés)
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs.jsonl")): raise FileNotFoundError("ap_docs.jsonl manquant après restauration.")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")): raise FileNotFoundError("ap_docs_preprocessed.jsonl manquant après restauration.")

except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential_qld(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes (BM25 ou QLD)."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")

        # Configurer similarité
        if model == 'bm25':
            print("  Configuration BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'qld': # Utiliser Query Likelihood Dirichlet
            print("  Configuration QLD..."); searcher.set_qld(); print("  QLD configuré.")
        else:
            print(f"Modèle '{model}' non reconnu, utilise BM25 par défaut."); searcher.set_bm25()

        # Itérer sur les requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations (BM25 et QLD) ---
print("\n--- DÉBUT DES RECHERCHES BASELINE (BM25/QLD) ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_qld.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_2, "baseline_short") # Utilise qld
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_qld.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_4, "baseline_long") # Utilise qld
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES (BM25/QLD) ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_qld.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False) # Utilise qld
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_qld.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False) # Utilise qld
print("\n--- Toutes les recherches de base (BM25/QLD) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}



# --- Nouvelle Cellule ---

# === Cellule 6: Évaluation des Runs (BM25/QLD) ===
# Lit les fichiers Qrels, lit les fichiers de résultats (.txt) du dossier RUN_DIR,
# calcule MAP et P@10, et affiche/sauvegarde les tableaux récapitulatifs.
# Devrait maintenant évaluer les runs BM25 et QLD.

import pandas as pd
import glob
import pytrec_eval
import os
import traceback

# Vérifier que les chemins sont définis
try:
    QRELS_DIR
    RUN_DIR
    EVAL_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Préparation des Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files: print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}."); qrels_dict = {}
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")
    all_qrels_data = []
    for qf in qrels_files:
        try:
            # Lire le fichier qrels en spécifiant les types pour éviter les erreurs
            qrels_df = pd.read_csv(qf, sep=r'\s+', names=['query_id', 'unused', 'doc_id', 'relevance'],
                                   dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
            all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']])
        except Exception as e: print(f"Erreur lecture Qrels {qf}: {e}")
    if not all_qrels_data: print("ERREUR: Impossible lire données Qrels."); qrels_dict = {}
    else:
        combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)
        qrels_dict = {}
        # Convertir le DataFrame en dictionnaire attendu par pytrec_eval
        for _, row in combined_qrels_df.iterrows():
            qid, did, rel = str(row['query_id']), str(row['doc_id']), int(row['relevance'])
            if rel < 0: continue # Ignorer jugements négatifs
            if qid not in qrels_dict: qrels_dict[qid] = {}
            qrels_dict[qid][did] = rel
        print(f"Total {len(qrels_dict)} requêtes avec jugements chargées.")

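# Reminder (toy values only) of the dictionary shapes pytrec_eval expects:
#   qrels_dict = {"101": {"AP880212-0001": 1, "AP880212-0002": 0}}
#   run_dict   = {"101": {"AP880212-0001": 12.37, "AP880212-0002": 11.02}}
# RelevanceEvaluator(qrels_dict, measures).evaluate(run_dict) returns one dict
# of metric values per query, which the loop below averages into MAP and P@10.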
# --- Évaluation des Runs ---
if not qrels_dict: print("\nAucun jugement de pertinence chargé, impossible d'évaluer.")
else:
    measures = {'map', 'P_10'} # Métriques à calculer
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures) # Initialiser l'évaluateur
    # Trouver tous les fichiers .txt dans le dossier des runs
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt")))
    print(f"\n{len(run_files)} fichiers de run à évaluer trouvés dans {RUN_DIR}.")
    print(f"  Fichiers: {[os.path.basename(f) for f in run_files]}") # Afficher les noms

    results_summary = [] # Liste pour stocker les résultats agrégés
    if not run_files: print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}.")
    else:
        # Boucler sur chaque fichier de run trouvé
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            print(f"\n--- Évaluation: {run_name} ---")
            run_dict = {} # Dictionnaire pour stocker les résultats de ce run
            error_count = 0
            line_count = 0
            try:
                # Lire le fichier run ligne par ligne
                with open(run_file, 'r', encoding='utf-8') as f_run:
                    for line in f_run:
                        line_count += 1
                        parts = line.strip().split()
                        # Vérifier le format TREC (6 colonnes)
                        if len(parts) != 6: error_count += 1; continue
                        qid, _, did, _, score, _ = parts # Extraire les infos utiles
                        try: score = float(score) # Convertir le score en float
                        except ValueError: error_count += 1; continue
                        qid = str(qid) # Assurer que qid est une chaîne
                        # Stocker le score pour ce document et cette requête
                        if qid not in run_dict: run_dict[qid] = {}
                        run_dict[qid][did] = score
                if error_count > 0: print(f"  Avertissement: {error_count} lignes mal formatées ignorées sur {line_count} lignes.")

                # Filtrer le run pour ne garder que les requêtes présentes dans les Qrels
                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_q = len(run_dict) - len(filtered_run_dict)
                if ignored_q > 0: print(f"  Avertissement: {ignored_q} requêtes run ignorées (absentes Qrels).")
                if not filtered_run_dict: print("  Erreur: Aucune requête ne correspond aux Qrels."); continue

                # Évaluer le run filtré avec pytrec_eval
                eval_results = evaluator.evaluate(filtered_run_dict)
                # Calculer les moyennes des métriques sur toutes les requêtes évaluées
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                # Afficher les résultats moyens pour ce run
                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")

                # Extraire les informations du nom de fichier pour le résumé
                parts = run_name.replace('.txt','').split('_')
                if len(parts) >= 3:
                    index_type, query_type, model_type = parts[0], parts[1], parts[2]
                    # Le suffixe RM3 éventuel est conservé dans le nom du modèle (ex: bm25_rm3)
                    model_type = "_".join(parts[2:])

                    # Ajouter les résultats au résumé
                    results_summary.append({
                        "Run Name": run_name, "Index": index_type,
                        "Query Type": query_type.capitalize(),
                        "Weighting Scheme": model_type.upper().replace('_', '+'), # Formatage pour affichage
                        "MAP": avg_map, "P@10": avg_p10
                    })
                else: print(f"  Avertissement: Impossible parser nom run '{run_name}'.")

            except FileNotFoundError: print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e: print(f"  Erreur évaluation {run_name}: {e}"); traceback.print_exc()

        # Afficher et sauvegarder le résumé final
        if results_summary:
            print("\n\n=== Tableau Récapitulatif des Résultats (BM25/QLD) ===")
            results_df = pd.DataFrame(results_summary)
            # Trier pour une meilleure lisibilité
            results_df = results_df.sort_values(by=["Index", "Query Type", "Weighting Scheme"])

            # Afficher le DataFrame complet
            print("\n--- Résultats Complets ---")
            print(results_df.to_markdown(index=False, floatfmt=".4f"))

            # Essayer d'afficher les tableaux pivots
            try:
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP (Tableau Pivot) ---")
                print(pivot_map.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot MAP: {e_pivot})")

            try:
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 (Tableau Pivot) ---")
                print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot P@10: {e_pivot})")

            # Sauvegarder le DataFrame complet final
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_final.csv")
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif complet sauvegardé: {summary_file_path}")
            except Exception as e_save: print(f"\nErreur sauvegarde résumé: {e_save}")
        else: print("\nAucun résultat d'évaluation à afficher.")



# --- Nouvelle Cellule ---

# === Cellule 7: Exécuter la Recherche Améliorée (RM3) ===
# Applique RM3 sur la meilleure configuration de base identifiée à l'étape 6.
# !! N'OUBLIEZ PAS DE CONFIGURER LES VARIABLES BEST_... CI-DESSOUS !!

from pyserini.search.lucene import LuceneSearcher
from jnius import autoclass, JavaException
from tqdm.notebook import tqdm
import time
import traceback
import os

# Recharger ClassicSimilarity n'est plus nécessaire car on utilise BM25/QLD
# try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
# except Exception: ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; EVAL_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise

# --- À CONFIGURER selon vos meilleurs résultats de l'Étape 6 (BM25/QLD) ---
# !! MODIFIEZ CECI EN FONCTION DE VOS RÉSULTATS D'ÉVALUATION !!
print("--- Configuration RM3 ---")
print("Veuillez éditer les variables BEST_... ci-dessous en fonction de vos meilleurs résultats MAP de l'étape précédente.")
# Exemple: si preproc + long + bm25 était le meilleur
BEST_INDEX_PATH = INDEX_DIR_PREPROC           # Ex: INDEX_DIR_BASELINE ou INDEX_DIR_PREPROC
BEST_QUERIES = queries_long_preprocessed      # Ex: queries_short, queries_long, ..._preprocessed
BEST_MODEL_BASE = 'bm25'                      # Ex: 'bm25' ou 'qld' (celui qui a donné le meilleur MAP)
BEST_RUN_TAG_PREFIX = "preproc_long"          # Ex: 'baseline_short', 'preproc_long'
USE_PREPROC_QUERY_FOR_RM3 = False             # Généralement False si BEST_QUERIES est déjà prétraité
# ----------------------------------------------------------------
print(f"Configuration choisie pour RM3:")
print(f"  Index: {os.path.basename(BEST_INDEX_PATH)}")
# print(f"  Requêtes: (variable BEST_QUERIES)") # Difficile d'afficher le nom de la variable
print(f"  Modèle Base: {BEST_MODEL_BASE}")
print(f"  Préfixe Tag: {BEST_RUN_TAG_PREFIX}")
print(f"  Utiliser Preproc Requête?: {USE_PREPROC_QUERY_FOR_RM3}")

# Nom du fichier et tag pour le run RM3
PRF_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3.txt")
RM3_RUN_TAG = f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3"

# Paramètres RM3
rm3_config = {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5}
print(f"  Paramètres RM3: {rm3_config}")

# --- Fonction de recherche RM3 (séquentielle) ---
def perform_search_sequential_rm3(queries, index_path, model_base, k, output_run_file, run_tag, use_preprocessed_query=False, rm3_params=None):
    """Exécute la recherche RM3 séquentiellement."""
    start_time = time.time()
    print(f"\nDébut recherche SÉQUENTIELLE RM3: Modèle='{model_base}+RM3', Tag='{run_tag}', k={k}")
    all_results_list = []
    searcher = None
    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        # Configurer similarité base
        if model_base == 'bm25': print("  Config BM25 (base)..."); searcher.set_bm25(k1=0.9, b=0.4)
        elif model_base == 'qld': print("  Config QLD (base)..."); searcher.set_qld()
        else: print(f"Modèle base '{model_base}' non reconnu, utilise BM25."); searcher.set_bm25()
        # Activer RM3
        print("  Activation RM3..."); searcher.set_rm3(**rm3_params); print("  RM3 activé.")
        # Itérer sur requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche RM3 QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche RM3...")
        # Écrire résultats
        if all_results_list:
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True) # Assurer que le dossier existe
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat RM3 généré.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")
        end_time = time.time()
        print(f"Recherche RM3 terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run RM3 {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# Lancer la recherche RM3 (après configuration des variables BEST_...)
print("\nLancement de la recherche RM3...")
perform_search_sequential_rm3(
    BEST_QUERIES, BEST_INDEX_PATH, BEST_MODEL_BASE, K_RESULTS,
    PRF_RUN_FILE, RM3_RUN_TAG,
    use_preprocessed_query=USE_PREPROC_QUERY_FOR_RM3, rm3_params=rm3_config
)

print("\n--- Exécution de la recherche RM3 terminée. ---")
# Vérifier si le fichier a été créé
print(f"\nVérification de la création du fichier {PRF_RUN_FILE}...")
!ls -l "{PRF_RUN_FILE}"
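
A quick way to check whether RM3 actually helps is to score both the base run and the RM3 run with pytrec_eval, which is already installed above. The sketch below is a minimal example, not part of the original notebook: the qrels filename ("qrels.all.txt") is an assumption to adapt to the files in QRELS_DIR, and the base run filename is reconstructed from the tag prefix and model.

# --- Sketch: compare MAP of the base run and the RM3 run with pytrec_eval ---
import pytrec_eval

QRELS_FILE = os.path.join(QRELS_DIR, "qrels.all.txt")  # hypothetical filename, adapt to your qrels
BASE_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}.txt")

with open(QRELS_FILE) as f:
    qrels = pytrec_eval.parse_qrel(f)
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map'})

for run_path in [BASE_RUN_FILE, PRF_RUN_FILE]:
    with open(run_path) as f:
        run = pytrec_eval.parse_run(f)
    scores = evaluator.evaluate(run)
    mean_map = sum(v['map'] for v in scores.values()) / max(len(scores), 1)
    print(f"{os.path.basename(run_path)}: MAP = {mean_map:.4f}")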



# --- Nouvelle Cellule ---

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")

ps = PorterStemmer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    filtered = [ps.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(filtered)
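
For a quick sanity check of this stemming-based variant, one can print a sample string before and after preprocessing (output not asserted here):

sample = "Information Retrieval systems are retrieving relevant documents"
print(sample)
print(preprocess(sample))  # lowercased, stop words removed, Porter-stemmed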

# --- Nouvelle Cellule ---

!pip install pysolr


# --- Nouvelle Cellule ---

# === Cellule 0.2: Installation des bibliothèques ===
# Pyserini nécessite Java 11, installons-le
!apt-get update -qq > /dev/null && apt-get install -y openjdk-11-jdk-headless -qq > /dev/null

# Installer Pyserini, NLTK et Pytrec_eval
!pip install pyserini==0.24.0 -q # Installe une version spécifique pour la stabilité
!pip install nltk -q
!pip install pytrec_eval -q

# Définir la variable d'environnement JAVA_HOME
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Télécharger les ressources NLTK nécessaires
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) # Ajouté pour WordNet

print("Installation terminée et ressources NLTK téléchargées.")

# --- Nouvelle Cellule ---

# === Cellule 0.3: Définir les chemins ===
# !!! ADAPTEZ CE CHEMIN VERS VOTRE DOSSIER SUR GOOGLE DRIVE !!!
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Projet_RI"

# Vérification que le chemin existe
if not os.path.exists(DRIVE_PROJECT_PATH):
    raise FileNotFoundError(f"Le chemin spécifié n'existe pas : {DRIVE_PROJECT_PATH}. Vérifiez le chemin dans la Cellule 0.1 et 0.3.")

AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar")  # adjust the name if your archive is AP.tar.gz
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "topics")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "ql")

# Chemins pour les sorties (index, résultats, etc.)
OUTPUT_DIR = os.path.join(DRIVE_PROJECT_PATH, "output")
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "pre")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "Corpus")  # Pour les documents extraits/formatés
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")       # Pour les fichiers de résultats TREC
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")      # Pour les fichiers d'évaluation

# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

print(f"Chemin du projet Drive: {DRIVE_PROJECT_PATH}")
print(f"Répertoire de sortie Colab: {OUTPUT_DIR}")

# --- Nouvelle Cellule ---

# === Cellule 0.4: Extraire et Formater les Documents ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression

# Chemin vers le fichier JSONL qui sera généré
JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteur pour vérifier
doc_count = 0

# Ouvrir/créer le fichier JSONL de sortie
with open(JSONL_OUTPUT_PATH, 'w') as outfile, tarfile.open(AP_TAR_PATH, "r") as tar:  # mode "r" auto-detects compression (.tar or .tar.gz)
    # Itérer sur chaque membre (fichier/dossier) dans l'archive tar
    for member in tqdm(tar.getmembers(), desc="Traitement des fichiers TAR"):
        # Vérifier si c'est un fichier régulier
        if member.isfile():
            # Extraire le contenu du fichier
            f = tar.extractfile(member)
            if f: # S'assurer que l'extraction a réussi
                content = f.read().decode('utf-8', errors='ignore') # Lire et décoder

                # Trouver tous les documents dans le fichier actuel
                for doc_match in doc_pattern.finditer(content):
                    doc_content = doc_match.group(1)

                    # Extraire DOCNO
                    docno_match = docno_pattern.search(doc_content)
                    if not docno_match:
                        continue # Passer si pas de DOCNO
                    doc_id = docno_match.group(1).strip()

                    # Extraire TEXT (et le nettoyer un peu)
                    text_match = text_pattern.search(doc_content)
                    if text_match:
                       doc_text = text_match.group(1).strip()
                       # Nettoyage simple: remplacer les nouvelles lignes par des espaces
                       doc_text = ' '.join(doc_text.split())
                    else:
                        doc_text = "" # Mettre une chaîne vide si pas de champ TEXT

                    # Écrire l'entrée JSONL
                    json_line = json.dumps({"id": doc_id, "contents": doc_text})
                    outfile.write(json_line + '\n')
                    doc_count += 1

print(f"Terminé. {doc_count} documents formatés dans {JSONL_OUTPUT_PATH}")
# Note: the AP collection used here should contain about 164,597 documents; check that the count above is close.
# The loop above works whether the archive stores the files at its root or inside per-year subfolders (ap88, ap89, ...).
# tarfile mode "r" opens the archive with transparent compression, so the same call handles both AP.tar and AP.tar.gz.
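
Optionally, a one-line check that the JSONL output has the expected {"id", "contents"} structure:

# Optional sanity check: inspect the first formatted document
with open(JSONL_OUTPUT_PATH) as f:
    print(json.loads(next(f)))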

# --- Nouvelle Cellule ---

# === Cellule 1.1: Fonction de Prétraitement ===
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Applique la tokenisation, la mise en minuscule, la suppression
    de la ponctuation, la suppression des stop words et la lemmatisation.
    """
    # Tokenisation et minuscules
    tokens = word_tokenize(text.lower())

    # Suppression ponctuation et mots non alphabétiques + stop words
    filtered_tokens = [
        lemmatizer.lemmatize(w) for w in tokens
        if w.isalpha() and w not in stop_words # Garde seulement les mots alphabétiques non-stop words
    ]

    # Rejoint les tokens en une chaîne de caractères
    return ' '.join(filtered_tokens)

# Exemple d'utilisation
sample_text = "This is an example showing Information Retrieval with lemmatization and stop words removal."
preprocessed_sample = preprocess_text(sample_text)
print(f"Original: {sample_text}")
print(f"Preprocessed: {preprocessed_sample}")

# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
print(f"Collection source: {JSONL_OUTPUT_PATH}")
print(f"Répertoire de l'index: {INDEX_DIR_BASELINE}")

# Commande Pyserini pour l'indexation
# -input: dossier contenant les fichiers JSONL
# -collection: type de collection (JsonCollection pour nos fichiers .jsonl)
# -generator: comment traiter les fichiers (DefaultLuceneDocumentGenerator crée un document par ligne JSON)
# -index: chemin où sauvegarder l'index
# -threads: nombre de threads à utiliser (ajustez selon les ressources Colab, 4 est raisonnable)
# -storePositions -storeDocvectors -storeRaw: stocke informations supplémentaires utiles pour certaines recherches avancées (comme le re-ranking ou PRF)
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input {CORPUS_DIR} \
  --index {INDEX_DIR_BASELINE} \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw

print(f"Indexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")

# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
# Write the preprocessed collection to its own folder: Cellule 1.4 indexes a whole
# directory, so that directory must contain only the preprocessed JSONL file.
CORPUS_PREPROC_DIR = os.path.join(OUTPUT_DIR, "Corpus_preproc")
os.makedirs(CORPUS_PREPROC_DIR, exist_ok=True)
JSONL_PREPROC_PATH = os.path.join(CORPUS_PREPROC_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées vers {JSONL_PREPROC_PATH}...")

doc_count_preproc = 0
# Lire le fichier JSONL original et écrire le fichier prétraité
with open(JSONL_OUTPUT_PATH, 'r') as infile, open(JSONL_PREPROC_PATH, 'w') as outfile:
    for line in tqdm(infile, desc="Prétraitement des documents"):
        try:
            data = json.loads(line)
            doc_id = data['id']
            original_contents = data['contents']

            # Appliquer le prétraitement
            preprocessed_contents = preprocess_text(original_contents)

            # Écrire la nouvelle ligne JSONL
            json_line = json.dumps({"id": doc_id, "contents": preprocessed_contents})
            outfile.write(json_line + '\n')
            doc_count_preproc += 1
        except json.JSONDecodeError:
            print(f"Erreur de décodage JSON sur une ligne, ignorée.") # Au cas où une ligne serait malformée
        except Exception as e:
            print(f"Erreur inattendue lors du prétraitement: {e}") # Autres erreurs possibles

print(f"Terminé. {doc_count_preproc} documents prétraités dans {JSONL_PREPROC_PATH}")

# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
print(f"Début de l'indexation avec Prétraitement...")
print(f"Collection source: {JSONL_PREPROC_PATH}") # Utilise le fichier .jsonl prétraité
print(f"Répertoire de l'index: {INDEX_DIR_PREPROC}")

# Same command as for the baseline, but the input is the directory that contains
# only the preprocessed JSONL file
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input {CORPUS_PREPROC_DIR} \
  --index {INDEX_DIR_PREPROC} \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw \
  --pretokenized  # the text is already tokenized; skip re-tokenization by Lucene's analyzer

print(f"Indexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")

# --- Nouvelle Cellule ---

# === Cellule 2.1: Parser les Fichiers Topics ===
import glob # Pour trouver les fichiers correspondant à un pattern

def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        # Utilise regex pour trouver chaque bloc <top>
        for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
            topic_content = top_match.group(1)
            # Extrait le numéro (num)
            num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
            if not num_match: continue
            topic_id = num_match.group(1).strip()

            # Extrait le titre (title) - prend tout après <title> jusqu'au prochain tag
            title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
            title = title_match.group(1).strip() if title_match else ""

            # Extrait la description (desc)
            desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
            desc = desc_match.group(1).strip() if desc_match else ""

            # Extrait la narrative (narr) - pas utilisée ici mais pourrait l'être
            # narr_match = re.search(r"<narr>\s*Narrative:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
            # narr = narr_match.group(1).strip() if narr_match else ""

            if topic_id and title: # Au moins un ID et un titre
                 topics[topic_id] = {'title': title, 'desc': desc}
    return topics

# Trouver tous les fichiers topics
topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
print(f"Fichiers topics trouvés: {topic_files}")

all_topics = {}
for tf in topic_files:
    print(f"Parsing {tf}...")
    all_topics.update(parse_topics(tf))

print(f"Total de {len(all_topics)} topics parsés.")

# Créer les dictionnaires de requêtes courtes et longues
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()} # Concatène titre et description

# Optionnel: Créer des versions prétraitées des requêtes
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}

print(f"Exemple Requête Courte (ID 51): {queries_short.get('51', 'Non trouvé')}")
print(f"Exemple Requête Longue (ID 51): {queries_long.get('51', 'Non trouvé')}")
print(f"Exemple Requête Courte Prétraitée (ID 51): {queries_short_preprocessed.get('51', 'Non trouvé')}")
print(f"Exemple Requête Longue Prétraitée (ID 51): {queries_long_preprocessed.get('51', 'Non trouvé')}")

# --- Nouvelle Cellule ---

# === Cellule 3.1: Fonction de Recherche et Sauvegarde ===
from pyserini.search.lucene import LuceneSearcher
import time
from multiprocessing import Pool, cpu_count

# --- Configuration des modèles de similarité ---
# Pyserini/Lucene utilise BM25 par défaut (avec k1=0.9, b=0.4)
# Pour TF-IDF, nous utilisons ClassicSimilarity de Lucene.
# Cela nécessite d'importer la classe Java via Pyjnius (le pont Python-Java de Pyserini)
from jnius import autoclass
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_single_query(args):
    """Fonction exécutée par chaque processus pour une seule requête."""
    query_id, query_text, index_path, model, k, run_tag, use_preprocessed_query = args

    try:
        # Initialiser le searcher DANS le processus fils
        searcher = LuceneSearcher(index_path)

        # Configurer le modèle de similarité
        if model == 'bm25':
            # Utiliser les valeurs par défaut de Pyserini ou spécifier les vôtres
            searcher.set_bm25(k1=0.9, b=0.4) # Valeurs standard BM25 TREC
        elif model == 'tfidf':
            searcher.set_similarity(ClassicSimilarity()) # Appliquer TF-IDF (ClassicSimilarity)
        else:
            # Par défaut ou erreur
            searcher.set_bm25() # Rétablir BM25 par sécurité

        # Prétraiter la requête si nécessaire (pour l'index prétraité)
        search_text = preprocess_text(query_text) if use_preprocessed_query else query_text

        # Exécuter la recherche
        hits = searcher.search(search_text, k=k)

        # Formater les résultats pour cette requête
        query_results = []
        for i in range(len(hits)):
            rank = i + 1
            doc_id = hits[i].docid
            score = hits[i].score
            # Format TREC: qid Q0 docid rank score run_tag
            query_results.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

        return query_results

    except Exception as e:
        print(f"Erreur lors de la recherche pour QID {query_id} avec {run_tag}: {e}")
        return [] # Retourne une liste vide en cas d'erreur


def run_search_parallel(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche en parallèle pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    # Préparer les arguments pour chaque tâche de processus
    tasks = []
    for query_id, query_text in queries.items():
        tasks.append((query_id, query_text, index_path, model, k, run_tag, use_preprocessed_query))

    # Utiliser un Pool de processus pour la parallélisation
    # Utiliser N-1 coeurs pour laisser un peu de marge, ou cpu_count()
    num_workers = max(1, cpu_count() - 1)
    print(f"Utilisation de {num_workers} processus parallèles...")

    all_results_list = []
    # Utiliser tqdm pour la barre de progression avec le Pool
    with Pool(num_workers) as pool:
       # pool.imap_unordered exécute les tâches et retourne les résultats dès qu'ils sont prêts
       # Cela peut être plus rapide si certaines requêtes prennent plus de temps
       results_iterator = pool.imap_unordered(perform_search_single_query, tasks)
       # Envelopper avec tqdm pour la barre de progression
       for result in tqdm(results_iterator, total=len(tasks), desc=f"Recherche {run_tag}"):
           all_results_list.extend(result) # Ajouter les lignes de résultats retournées par chaque processus


    # Écrire les résultats dans le fichier de run TREC
    with open(output_run_file, 'w') as f_out:
       f_out.writelines(all_results_list)

    end_time = time.time()
    print(f"Recherche terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
    print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")


# --- Exécution des différentes configurations ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête (standard TREC)

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
run_search_parallel(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
run_search_parallel(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
run_search_parallel(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
run_search_parallel(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# Important: Utiliser les requêtes prétraitées correspondantes

# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
run_search_parallel(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
 # Note: these queries are already preprocessed, so use_preprocessed_query=False here
 #       (the flag tells the function to apply preprocess_text itself).
 #       Passing `queries_short` with `use_preprocessed_query=True` would be equivalent.

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
run_search_parallel(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
run_search_parallel(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
run_search_parallel(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base sont terminées.")

# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (Séquentielle d'abord) ===
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc()) # Affiche la trace complète de l'erreur Java
                 raise # Arrête l'exécution pour ce run si la similarité ne peut être définie
        else:
            print("  Configuration BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante

        # Écrire les résultats dans le fichier de run TREC
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        # Important: Fermer le searcher pour libérer les ressources Java, même en cas d'erreur
        if searcher:
             try:
                 # Note: Pyserini ne semble pas avoir de méthode close() explicite sur LuceneSearcher
                 # La JVM devrait se nettoyer, mais c'est une bonne pratique si disponible
                 # searcher.close() # Décommentez si une telle méthode existe dans votre version
                 print(f"  Nettoyage implicite des ressources pour {run_tag}.")
                 pass
             except Exception as e_close:
                 print(f"  Erreur lors de la tentative de fermeture du searcher pour {run_tag}: {e_close}")


# --- Exécution des différentes configurations (en mode séquentiel) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel) sont terminées.")

# --- Note importante ---
# Si cette cellule s'exécute sans planter (même si c'est lent),
# le problème est probablement lié à la parallélisation (mémoire/conflits JVM).
# Si elle plante encore, surtout lors des runs 'tfidf',
# le problème pourrait être lié à ClassicSimilarity ou à l'environnement Java lui-même.
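
If parallel search is still wanted later, one possible direction (untested here, and not the notebook's own approach) is to open a single LuceneSearcher per worker process via a Pool initializer, using the "spawn" start method because forking an already-running JVM is a common source of the crashes mentioned above:

# Sketch: one LuceneSearcher per worker process instead of one per query
from multiprocessing import get_context

_worker_searcher = None

def init_worker(index_path):
    """Runs once in each worker: start the JVM and open the index there."""
    global _worker_searcher
    _worker_searcher = LuceneSearcher(index_path)
    _worker_searcher.set_bm25(k1=0.9, b=0.4)

def search_one(task):
    query_id, query_text, k, run_tag = task
    hits = _worker_searcher.search(query_text, k=k)
    return [f"{query_id} Q0 {h.docid} {i + 1} {h.score:.6f} {run_tag}\n" for i, h in enumerate(hits)]

# Usage sketch (kept commented out):
# ctx = get_context("spawn")
# tasks = [(qid, q, K_RESULTS, "baseline_short_bm25") for qid, q in queries_short.items()]
# with ctx.Pool(2, initializer=init_worker, initargs=(INDEX_DIR_BASELINE,)) as pool:
#     lines = [line for chunk in pool.map(search_one, tasks) for line in chunk]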


# --- Nouvelle Cellule ---

!pip install pyserini

# --- Nouvelle Cellule ---

import nltk
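# Recent NLTK releases (3.8.2 and later) look up the 'punkt_tab' resource instead of 'punkt'
# when word_tokenize is called, hence this extra download.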
nltk.download('punkt_tab')

# --- Nouvelle Cellule ---

# === Cellule 0.3: Définir les chemins ===
# !!! ADAPTEZ CE CHEMIN VERS VOTRE DOSSIER SUR GOOGLE DRIVE !!!
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Projet_RI"

# Vérification que le chemin existe
if not os.path.exists(DRIVE_PROJECT_PATH):
    raise FileNotFoundError(f"Le chemin spécifié n'existe pas : {DRIVE_PROJECT_PATH}. Vérifiez le chemin dans la Cellule 0.1 et 0.3.")

# Corrected the path for AP_TAR_PATH by removing the extra DRIVE_PROJECT_PATH
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar") # Assumant que c'est un .tar.gz, sinon ajustez
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "topics/") #Corrected the path
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "ql/") #Corrected the path


# Chemins pour les sorties (index, résultats, etc.) dans l'environnement Colab
OUTPUT_DIR = os.path.join(DRIVE_PROJECT_PATH, "output/") #Corrected the path
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "baseline") #Corrected the path
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "pre") #Corrected the path
CORPUS_DIR = os.path.join(OUTPUT_DIR, "Corpus") # Pour les documents extraits/formatés
RUN_DIR = os.path.join(OUTPUT_DIR, "runs") # Pour les fichiers de résultats TREC
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval") # Pour les fichiers d'évaluation

# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

print(f"Chemin du projet Drive: {DRIVE_PROJECT_PATH}")
print(f"Répertoire de sortie Colab: {OUTPUT_DIR}")

# --- Nouvelle Cellule ---

# Corrected opening line for Cellule 0.4: AP.tar is an uncompressed archive, so mode "r:" (or simply "r") works.
# The body of the extraction loop is unchanged from Cellule 0.4 above.
with open(JSONL_OUTPUT_PATH, 'w') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
    ...

To cite this code:

Loyer, Dominique. (2024). TREC_AP88-90_5juin25(Pyserini&Lucene).ipynb [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

tri2706.py

This Python script tidies up a file-recovery folder: it first indexes every file on the Desktop, in the documents folder and in Downloads by SHA-256 hash, then moves each file from TOUT_A_RECUPERER either to a duplicates folder (when an identical file already exists) or into the documents folder, and writes a report listing files whose names are not human-readable. It runs as a dry run by default and only moves files when launched with --execute.

Keywords: file management, deduplication, SHA-256, shutil, dry run

import os
import hashlib
import shutil
from pathlib import Path
import sys

# =============================================================================
# --- CONFIGURATION PRINCIPALE ---
# Vérifiez que ces chemins sont corrects.
# =============================================================================
HOME_DIR = Path.home()
SOURCE_DIR = HOME_DIR / "Desktop" / "TOUT_A_RECUPERER"
DESKTOP_DIR = HOME_DIR / "Desktop"
TARGET_DOCS_DIR = HOME_DIR / "°°Mes documents°°"
TARGET_DOWNLOADS_DIR = HOME_DIR / "Downloads"

# Les dossiers de sortie seront créés sur le Bureau pour un accès facile
OUTPUT_BASE_DIR = HOME_DIR / "Desktop" / "RESULTAT_DU_TRI"
DUPLICATES_DIR = OUTPUT_BASE_DIR / "Dossier_Doublons"
REPORT_FILE = OUTPUT_BASE_DIR / "FICHIERS_A_NOMMER_MANUELLEMENT.txt"

# =============================================================================
# --- FONCTIONS UTILITAIRES ---
# =============================================================================

def calculate_hash(filepath):
    """Calcule le hash SHA256 d'un fichier."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while (data := f.read(65536)):
                sha256.update(data)
        return sha256.hexdigest()
    except IOError:
        return None

def is_human_readable(filename):
    """Vérifie si un nom de fichier semble lisible par un humain."""
    # Règle simple : a-t-il au moins quelques lettres et n'est-il pas une chaîne hexadécimale ?
    try:
        int(Path(filename).stem, 16)
        return False  # C'est probablement un nom non-lisible (ex: hash)
    except ValueError:
        return any(c.isalpha() for c in filename)

def safe_move(src_path, dest_dir, dry_run=False):
    """Déplace un fichier de manière sécurisée, en évitant les conflits."""
    dest_path = dest_dir / src_path.name
    if dry_run:
        print(f"[DRY RUN] Déplacerait '{src_path}' vers '{dest_dir}'")
        return
    try:
        if dest_path.exists():
            # Ne pas écraser, créer un nom unique
            i = 1
            while True:
                new_name = f"{dest_path.stem}_{i}{dest_path.suffix}"
                new_dest_path = dest_dir / new_name
                if not new_dest_path.exists():
                    shutil.move(str(src_path), str(new_dest_path))
                    break
                i += 1
        else:
            shutil.move(str(src_path), str(dest_path))
    except Exception as e:
        print(f"\n[ERREUR] Impossible de déplacer '{src_path}': {e}")

# =============================================================================
# --- FONCTION PRINCIPALE ---
# =============================================================================

def main(dry_run=False):
    """Exécute le processus de réintégration."""
    if dry_run:
        print("="*60)
        print("          MODE SIMULATION (DRY RUN) - AUCUN FICHIER NE SERA DÉPLACÉ")
        print("="*60)
    
    # --- Création des dossiers de sortie ---
    OUTPUT_BASE_DIR.mkdir(exist_ok=True)
    DUPLICATES_DIR.mkdir(exist_ok=True)

    # --- Phase 1: Indexation des fichiers existants (zones protégées) ---
    print("\n--- Phase 1: Indexation des fichiers existants (Bureau, Documents, Downloads)...")
    existing_files_index = {}  # {hash: [paths]}
    scan_zones = [DESKTOP_DIR, TARGET_DOCS_DIR, TARGET_DOWNLOADS_DIR]
    
    for zone in scan_zones:
        print(f"Analyse de '{zone}'...")
        for filepath in zone.rglob('*'):
            if filepath.is_file() and not any(part.startswith('.') for part in filepath.parts):
                file_hash = calculate_hash(filepath)
                if file_hash:
                    if file_hash not in existing_files_index:
                        existing_files_index[file_hash] = []
                    existing_files_index[file_hash].append(filepath)

    print(f"Indexation terminée. {len(existing_files_index)} fichiers uniques trouvés.")

    # --- Phase 2: Traitement de la zone source (TOUT_A_RECUPERER) ---
    print(f"\n--- Phase 2: Traitement du dossier source '{SOURCE_DIR}'...")
    if not SOURCE_DIR.is_dir():
        print(f"[ERREUR] Le dossier source n'existe pas. Abandon.")
        return

    files_to_process = list(SOURCE_DIR.rglob('*'))
    total_files = len(files_to_process)
    report_content = []

    for i, src_path in enumerate(files_to_process):
        print(f"\rTraitement des fichiers sources : {i+1}/{total_files}", end="")
        
        # Ignorer les dossiers et les fichiers cachés
        if not src_path.is_file() or src_path.name.startswith('.'):
            continue
            
        # Règle de sécurité : Ne jamais toucher à un dossier 'Library'
        if 'Library' in src_path.parts:
            continue

        # Vérifier la lisibilité du nom
        if not is_human_readable(src_path.name):
            report_content.append(str(src_path))
            continue

        # Calculer le hash du fichier source
        src_hash = calculate_hash(src_path)
        if not src_hash:
            continue

        # Vérifier si c'est un doublon d'un fichier déjà en place
        if src_hash in existing_files_index:
            safe_move(src_path, DUPLICATES_DIR, dry_run)
        else:
            # Si c'est un fichier unique, le réintégrer dans "Mes documents"
            safe_move(src_path, TARGET_DOCS_DIR, dry_run)
            # Mettre à jour l'index pour éviter de réintégrer des doublons de la source
            existing_files_index[src_hash] = [TARGET_DOCS_DIR / src_path.name]

    print("\nTraitement de la source terminé.")

    # --- Écriture du rapport pour les fichiers non lisibles ---
    if report_content:
        print(f"\n--- Phase 3: Écriture du rapport...")
        with open(REPORT_FILE, 'w', encoding='utf-8') as f:
            f.write("Les fichiers suivants ont des noms considérés comme non-lisibles et n'ont pas été déplacés.\n")
            f.write("Ils sont toujours dans le dossier 'TOUT_A_RECUPERER'.\n\n")
            for line in report_content:
                f.write(line + '\n')
        print(f"Rapport sauvegardé sur votre bureau : '{REPORT_FILE.name}'")

    print("\nOpération terminée.")
    if dry_run:
        print("\nSimulation terminée. Revoyez la sortie pour voir les actions qui auraient été prises.")
    else:
        print("\nLe tri est terminé. Le dossier 'TOUT_A_RECUPERER' devrait maintenant être beaucoup plus léger.")


if __name__ == "__main__":
    # --- Mode de fonctionnement ---
    # Par défaut, le script s'exécute en mode simulation.
    # Pour l'exécuter en mode réel, il faut ajouter l'argument --execute
    is_dry_run = True
    if len(sys.argv) > 1 and sys.argv[1] == '--execute':
        is_dry_run = False
        confirm = input(
            "ATTENTION : Vous êtes sur le point d'exécuter le script en mode RÉEL.\n"
            "Des fichiers seront déplacés de manière permanente.\n"
            "Tapez 'EXECUTER' en majuscules pour confirmer : "
        )
        if confirm != "EXECUTER":
            print("Confirmation non valide. Abandon de l'opération.")
            sys.exit()

    main(dry_run=is_dry_run)
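
Typical invocation, as implied by the argument handling above:

python3 tri2706.py            # default: dry run, nothing is moved
python3 tri2706.py --execute  # real run; asks you to type EXECUTER to confirm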

To cite this code:

Loyer, Dominique. (2024). tri2706.py [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

TweetExtractionCode_va.r

R script that authenticates to the Twitter API with twitteR/ROAuth, collects tweets matching a search query ("india fintech"), cleans the text (retweet markers, links, mentions, punctuation), exports the results to CSV, builds a word cloud with tm/wordcloud, and plots NRC sentiment scores computed with syuzhet and ggplot2.

Keywords: Twitter API, R, sentiment analysis, word cloud, syuzhet

##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("C:/Users/rpand/Desktop/Documents/Classes/Classes/Sentiment_Accelerator")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(tm)        # needed below for Corpus() and tm_map()
library(sentiment)
library(RCurl)
library(syuzhet)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

#connect to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### Twitter Application credentials (fill in your own keys)
consumerKey="YOUR_CONSUMER_KEY"
consumerSecret="YOUR_CONSUMER_SECRET"
accesstoken="YOUR_ACCESS_TOKEN"
accesssecret="YOUR_ACCESS_SECRET"

Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl')) # A URL appears in the console; open it, authorize the app, and enter the PIN back in the console

##### Authorization PIN -DYNAMIC

save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
#Once you launch the code first time, you can start from this line in the future (libraries should be connected)

setup_twitter_oauth(consumer_key=consumerKey, consumer_secret=consumerSecret, access_token =accesstoken, access_secret = accesssecret )



##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = searchTwitter("india fintech", n=1000, since = "2016-01-01", lang= "en")

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, colors = pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")

To cite this code:

Loyer, Dominique. (2024). TweetExtractionCode_va.r [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

tweets.R

Variant of the previous script aimed at #Election2020 tweets: it sets up Twitter authentication (twitteR/rtweet), harvests tweets, applies the same text-cleaning steps, exports to CSV, builds a word cloud, and plots NRC sentiment scores with syuzhet and ggplot2.

Keywords: Twitter API, R, rtweet, sentiment analysis, Election2020

##**********Steps to Set up authorization to connect and extract tweets********
### Setting Working Directory
setwd("/Users/SherbrookeInformatique/Bureau")


library(twitteR)
library(ROAuth)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(wordcloud)
library(tm)        # needed below for Corpus() and tm_map()
library(sentiment)
library(RCurl)
library(syuzhet)
library(sentimentr)
# load twitter library - the rtweet library is recommended now over twitteR
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)

install.packages("httpuv")
library(httpuv)



### Fill out my tokens
consumer_key=""
consumer_secret=""
access_token=""
access_secret=""



setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
tw = twitteR::searchTwitter('#realDonaldTrump + #HillaryClinton', n = 6, since = '2016-11-08', retryOnRateLimit = 1e3)
d = twitteR::twListToDF(tw)

oauth_endpoint(authorize = "https://api.twitter.com/oauth",
               access = "https://api.twitter.com/oauth/access_token")

#connect to API
download.file(url='http://curl.haxx.se/ca/cacert.pem', destfile='cacert.pem')
reqURL <- 'https://api.twitter.com/oauth/request_token'
accessURL <- 'https://api.twitter.com/oauth/access_token'
authURL <- 'https://api.twitter.com/oauth/authorize'

### put my credientials here
consumer_key=""
consumer_secret=""
access_token=""
access_token_secret=""

Cred <- OAuthFactory$new(consumerKey=consumer_key,
                         consumerSecret=consumer_secret,
                         requestURL=reqURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo = system.file('CurlSSL', 'cacert.pem', package = 'RCurl')) # A URL appears in the console; open it, authorize the app, and enter the PIN back in the console

##### Authorization PIN -DYNAMIC


Elec = search_tweets("#Election2020", n=100000, lang='en', include_rts=FALSE)
Elec





save(Cred, file='twitter authentication.Rdata')

load('twitter authentication.Rdata') 
#Once you launch the code first time, you can start from this line in the future (libraries should be connected)

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)



##****************Step 3: Perform tweets extraction and data cleaning****************

# Harvest some tweets

some_tweets = searchTwitter("Election2020", n=1000, since = "2020-11-03", lang= "en")

# Explore Tweets

length.some_tweets <- length(some_tweets)
length.some_tweets

some_tweets.df <- ldply(some_tweets, function(t) t$toDataFrame())
write.csv(some_tweets.df, "tweets.csv")

# get the text
some_txt = sapply(some_tweets, function(x) x$getText())

# Cleaning 1-  remove people name, RT text etc. 

some_txt1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",some_txt)

# Cleaning 2- remove html links
some_txt2 = gsub("http[^[:blank:]]+", "", some_txt1)

# Cleaning 3- remove people names

some_txt3 = gsub("@\\w+", "", some_txt2)

# Cleaning 4- remove Punctuations 

some_txt4 = gsub("[[:punct:]]", " ", some_txt3)

# Cleaning 5- remove Punctuations 

some_txt5 = gsub("[^[:alnum:]]", " ", some_txt4)

# Exporting to Excel

write.csv(some_txt5, "tweets1.csv")

# Creating wordcorpus and cleaning

some_txt6 <- Corpus(VectorSource(some_txt5))
some_txt6 <- tm_map(some_txt6, removePunctuation)
some_txt6 <- tm_map(some_txt6, content_transformer(tolower))
some_txt6 <- tm_map(some_txt6, removeWords, stopwords("english"))
some_txt6 <- tm_map(some_txt6, stripWhitespace)

# Building wordcloud

pal <- brewer.pal(12,"Dark2")

wordcloud(some_txt6, min.freq = 5,  max.words = Inf, width=1000, height =1000,  random.order = FALSE, colors = pal)

# Sentiment Analysis

# how the function works

get_nrc_sentiment("I bought an iPhone a few days ago. It is such a nice phone, although a little large. The touch screen is cool.The voice quality is clear too. I simply love it!")

# Running on our data

mysentiment <- get_nrc_sentiment(some_txt5)
SentimentScores <- data.frame(colSums(mysentiment[,]))
names(SentimentScores) <- "Score"
SentimentScores <- cbind("sentiment" = rownames(SentimentScores), SentimentScores)
rownames(SentimentScores) <- NULL
ggplot(data = SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") + ggtitle("Total Sentiment Score Based on Tweets")

To cite this code:

Loyer, Dominique. (2024). tweets.R [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

US_Election_2020 copie.py

Erreur lors de la génération de la description.

Mots-clés: erreur, api

from tweepy.streaming import StreamListener  
from tweepy import OAuthHandler  
from tweepy import Stream  
   
import twitter_credentials  
   
# # # # TWITTER STREAMER # # # #  
class TwitterStreamer():  
    """  
    Class for streaming and processing live tweets.    """    def __init__(self):  
        pass  
  
    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):  
        # This handles Twitter authetification and the connection to Twitter Streaming API  
        listener = StdOutListener(fetched_tweets_filename)  
        auth = OAuthHandler(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)  
        auth.set_access_token(twitter_credentials.ACCESS_TOKEN, twitter_credentials.ACCESS_TOKEN_SECRET)  
        stream = Stream(auth, listener)  
  
        # This filters the Twitter stream to capture data matching the keywords:
        stream.filter(track=hash_tag_list)
  
  
# # # # TWITTER STREAM LISTENER # # # #  
class StdOutListener(StreamListener):  
    """  
    This is a basic listener that just prints received tweets to stdout.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename
  
    def on_data(self, data):  
        try:  
            print(data)  
            with open(self.fetched_tweets_filename, 'a') as tf:  
                tf.write(data)  
            return True  
        except BaseException as e:  
            print("Error on_data %s" % str(e))  
        return True  
            
    def on_error(self, status):  
        print(status)  
  
   
if __name__ == '__main__':  
   
    # Authenticate using twitter_credentials.py and connect to the Twitter Streaming API.
    hash_tag_list = ["donald trump", "hillary clinton", "barack obama", "bernie sanders"]
    fetched_tweets_filename = "tweets.txt"  
  
    twitter_streamer = TwitterStreamer()  
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)

Pour citer ce code :

Loyer, Dominique. (2024). US_Election_2020 copie.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

US_Election_2020.py

Version quasi identique du script précédent : streaming Tweepy (StreamListener) des tweets correspondant à une liste de mots-clés, écrits dans tweets.txt, avec authentification via le module local twitter_credentials.

Mots-clés: Twitter,Tweepy,Streaming,API,Collecte de données

from tweepy.streaming import StreamListener  
from tweepy import OAuthHandler  
from tweepy import Stream  
   
import twitter_credentials  
   
# # # # TWITTER STREAMER # # # #  
class TwitterStreamer():  
    """  
    Class for streaming and processing live tweets.
    """
    def __init__(self):
        pass
  
    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):  
        # This handles Twitter authentication and the connection to the Twitter Streaming API
        listener = StdOutListener(fetched_tweets_filename)  
        auth = OAuthHandler(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)  
        auth.set_access_token(twitter_credentials.ACCESS_TOKEN, twitter_credentials.ACCESS_TOKEN_SECRET)  
        stream = Stream(auth, listener)  
  
        # This filters the Twitter stream to capture data matching the keywords:
        stream.filter(track=hash_tag_list)
  
  
# # # # TWITTER STREAM LISTENER # # # #  
class StdOutListener(StreamListener):  
    """  
    This is a basic listener that just prints received tweets to stdout.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename
  
    def on_data(self, data):  
        try:  
            print(data)  
            with open(self.fetched_tweets_filename, 'a') as tf:  
                tf.write(data)  
            return True  
        except BaseException as e:  
            print("Error on_data %s" % str(e))  
        return True  
            
    def on_error(self, status):  
        print(status)  
  
   
if __name__ == '__main__':  
   
    # Authenticate using twitter_credentials.py and connect to the Twitter Streaming API.
    hash_tag_list = ["donald trump", "hillary clinton", "barack obama", "bernie sanders"]
    fetched_tweets_filename = "tweets.txt"  
  
    twitter_streamer = TwitterStreamer()  
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)

Pour citer ce code :

Loyer, Dominique. (2024). US_Election_2020.py [Code source]. Repris de https://dominiqueloyer.github.io/Codes.html

v1832.ipynb

Ce notebook Colab met en œuvre un pipeline de recherche d'informations sur la collection AP (TREC) : montage de Google Drive, configuration complète (Java 21, Pyserini, NLTK, prétraitement avec stemming de Porter), extraction du corpus AP.tar vers JSONL, indexation Lucene (baseline et prétraitée) et exécution des recherches BM25 et QLD.

Mots-clés: Récupération d'informations,Pyserini,Indexation,BM25,QLD,Stemming

# === Cellule 0.1: Monter Google Drive ===
from google.colab import drive
drive.mount('/content/drive')

# Vérifiez que le dossier du projet est accessible
# Adaptez le chemin si nécessaire en fonction de l'emplacement dans votre Drive
!ls "/content/drive/MyDrive/Projet_RI"

# --- Nouvelle Cellule ---

# === Cellule de Vérification du Contenu du Dossier Runs (Corrigé) ===
# Utilise les commandes shell de Colab préfixées par '!'

# Chemin exact où les résultats de recherche sont attendus
# (Défini dans la cellule de configuration complète)
RUN_DIR_PATH="/content/ap_output/runs/"

# Afficher le chemin qui sera vérifié
print(f"Vérification du contenu de : {RUN_DIR_PATH}")

# Utiliser '!' pour exécuter la commande shell 'ls -lh'
# Mettre le chemin entre guillemets pour gérer les espaces potentiels (même s'il n'y en a pas ici)
!ls -lh "{RUN_DIR_PATH}"


# --- Nouvelle Cellule ---

# === Cellule 4: Exécuter les Recherches (Séquentielles - BM25 & QLD) ===
# Lance les 8 combinaisons de recherche en utilisant BM25 et QLD.
# S'assure que l'environnement Java 21 est actif et que les index/variables sont définis/restaurés.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import JavaException # Importer seulement JavaException, ClassicSimilarity n'est pas utilisé

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index restaurés
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; CORPUS_DIR; # Ajout CORPUS_DIR pour vérif jsonl
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline restauré manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed restauré manquant: {INDEX_DIR_PREPROC}")
    # Vérifier aussi que les fichiers de corpus sont là (restaurés ou recréés)
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs.jsonl")): raise FileNotFoundError("ap_docs.jsonl manquant.")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")): raise FileNotFoundError("ap_docs_preprocessed.jsonl manquant.")

except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential_qld(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes (BM25 ou QLD)."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")

        # Configurer similarité
        if model == 'bm25':
            print("  Configuration BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'qld': # Utiliser Query Likelihood Dirichlet
            print("  Configuration QLD..."); searcher.set_qld(); print("  QLD configuré.")
        else:
            print(f"Modèle '{model}' non reconnu, utilise BM25 par défaut."); searcher.set_bm25()

        # Itérer sur les requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             # S'assurer que le dossier RUN_DIR existe avant d'écrire
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations (BM25 et QLD) ---
print("\n--- DÉBUT DES RECHERCHES BASELINE (BM25/QLD) ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_qld.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_2, "baseline_short")
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_qld.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_4, "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES (BM25/QLD) ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_qld.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_qld.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches de base (BM25/QLD) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}



# --- Nouvelle Cellule ---

# === Cellule 4: Exécuter les Recherches (Séquentielles - BM25 & QLD) ===
# Lance les 8 combinaisons de recherche en utilisant BM25 et QLD.
# S'assure que l'environnement Java 21 est actif et que les index/variables sont définis/restaurés.

# Assurer que pyserini est installé avant l'import
# Vous devriez normalement exécuter la Cellule 0 "Configuration Complète" avant celle-ci.
# Cette ligne est ajoutée comme filet de sécurité si la Cellule 0 n'a pas été exécutée
# ou a échoué pour pyserini. Supprimez-la si vous exécutez toujours la Cellule 0.
!pip install pyserini --quiet

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import JavaException # Importer seulement JavaException, ClassicSimilarity n'est pas utilisé

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index restaurés
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; CORPUS_DIR; # Ajout CORPUS_DIR pour vérif jsonl
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline restauré manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed restauré manquant: {INDEX_DIR_PREPROC}")
    # Vérifier aussi que les fichiers de corpus sont là (restaurés ou recréés)
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs.jsonl")): raise FileNotFoundError("ap_docs.jsonl manquant.")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")): raise FileNotFoundError("ap_docs_preprocessed.jsonl manquant.")

except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential_qld(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes (BM25 ou QLD)."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")

        # Configurer similarité
        if model == 'bm25':
            print("  Configuration BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'qld': # Utiliser Query Likelihood Dirichlet
            print("  Configuration QLD..."); searcher.set_qld(); print("  QLD configuré.")
        else:
            print(f"Modèle '{model}' non reconnu, utilise BM25 par défaut."); searcher.set_bm25()

        # Itérer sur les requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             # S'assurer que le dossier RUN_DIR existe avant d'écrire
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations (BM25 et QLD) ---
print("\n--- DÉBUT DES RECHERCHES BASELINE (BM25/QLD) ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_qld.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_2, "baseline_short")
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_qld.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_4, "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES (BM25/QLD) ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_qld.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_qld.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches de base (BM25/QLD) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}
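
# Esquisse facultative : vérification rapide du format TREC des fichiers de run
# produits ci-dessus (en supposant que run_file_1 ... run_file_8 existent bien).
# Chaque ligne d'un run doit compter six colonnes : "qid Q0 docid rang score tag".
def peek_run_file(path, n=3):
    """Affiche les n premières lignes d'un run et vérifie le nombre de colonnes."""
    with open(path, 'r', encoding='utf-8') as f_run:
        for i, line in enumerate(f_run):
            if i >= n: break
            cols = line.split()
            assert len(cols) == 6, f"Format inattendu ({len(cols)} colonnes): {line!r}"
            print(f"  {os.path.basename(path)} -> {line.strip()}")

if os.path.exists(run_file_1):
    peek_run_file(run_file_1)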

# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (avec Stemming) ===
# Installe Java 21, configure comme défaut, installe outils build,
# pybind11, dernière Pyserini, NLTK+ressources, définit chemins,
# FONCTION preprocess_text AVEC STEMMING, parse topics.

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète (avec Stemming) ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
# S'assurer que nltk est importé
import nltk
# Liste incluant la correction pour punkt_tab
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab']
for resource in nltk_resources:
    try:
        # Déterminer le chemin de recherche correct pour nltk.data.find
        if resource == 'punkt' or resource == 'punkt_tab': # punkt_tab est aussi dans tokenizers
            resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4':
             resource_path = f'corpora/{resource}.zip' # Open Multilingual Wordnet
        elif resource == 'wordnet':
             resource_path = f'corpora/{resource}.zip'
        else: # stopwords, etc.
            resource_path = f'corpora/{resource}.zip'

        # Essayer de trouver la ressource
        nltk.data.find(resource_path)
        # print(f"  Ressource NLTK '{resource}' déjà présente.")

    # Utiliser except LookupError (correction appliquée)
    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            # Capturer les erreurs potentielles de téléchargement (réseau, etc.)
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")

print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive (corrigé)
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Sera recréé avec stemming
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement (AVEC STEMMING) ---
print("\n[8/9] Définition de la fonction preprocess_text (avec Stemming)...")
# S'assurer que nltk est importé avant d'utiliser ses modules
import nltk
from nltk.corpus import stopwords
# --- Utilisation de PorterStemmer ---
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
# --- Création de l'objet Stemmer ---
stemmer_obj_global = PorterStemmer()
def preprocess_text(text):
    """Applique tokenisation, minuscules, suppression ponctuation/non-alpha, stop words ET STEMMING (Porter)."""
    if not isinstance(text, str): return ""
    try:
        tokens = word_tokenize(text.lower())
    except LookupError as e_tok: # Gestion erreur si ressource NLTK manque
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              resource_name = str(e_tok).split('Resource ')[1].split(' ')[0]
              print(f"--- Tokenizer a besoin de '{resource_name}', tentative téléchargement ---")
              try:
                  nltk.download(resource_name, quiet=True)
                  print(f"--- Ressource '{resource_name}' téléchargée, nouvelle tentative de tokenisation ---")
                  tokens = word_tokenize(text.lower()) # Retenter après téléchargement
              except Exception as e_dl_tok:
                  print(f"--- Échec du téléchargement de '{resource_name}': {e_dl_tok} ---")
                  raise e_tok # Relancer l'erreur originale si le téléchargement échoue
         else:
              raise e_tok # Relancer si ce n'est pas une ressource manquante connue
    except Exception as e_tok_other:
         print(f"Erreur inattendue dans word_tokenize: {e_tok_other}")
         raise e_tok_other
    # --- Application du Stemmer ---
    filtered_tokens = [stemmer_obj_global.stem(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie avec PorterStemmer.")
# Tester la nouvelle fonction
sample_text = "This is an example showing Information Retrieval with stemming and stop words removal."
stemmed_sample = preprocess_text(sample_text)
print(f"  Exemple Stemmed: {stemmed_sample}") # Doit afficher 'thi is exampl show inform retriev with stem and stop word remov.'

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics
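
# Esquisse facultative : auto-test de parse_topics sur un topic TREC synthétique
# (exemple minimal reprenant le format <top>/<num>/<title>/<desc> attendu ci-dessus).
import tempfile
_sample_topic = """<top>
<num> Number: 051
<title> Airbus Subsidies
<desc> Description:
Document will discuss government assistance to Airbus Industrie.
</top>"""
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='utf-8') as _tmp:
    _tmp.write(_sample_topic)
    _tmp_path = _tmp.name
print(f"  Auto-test parse_topics: {parse_topics(_tmp_path)}")
# Sortie attendue: {'051': {'title': 'Airbus Subsidies', 'desc': 'Document will discuss ...'}}
os.remove(_tmp_path)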

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    print(f"  Parsing des fichiers topics: {topic_files}")
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
# Mettre la création des dictionnaires prétraités dans un try-except
try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés.")
    print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes (avec stemming)...")
    # Appliquer la NOUVELLE fonction preprocess_text (avec stemming)
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries:
     print(f"\nERREUR lors du prétraitement des requêtes: {e_preproc_queries}")
     print("Les dictionnaires prétraités pourraient être incomplets ou vides.")
     # Créer des dictionnaires vides pour éviter NameError plus tard
     queries_short_preprocessed = {}
     queries_long_preprocessed = {}


# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète (avec Stemming) Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.") # Devrait être ~275Mo

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.") # Devrait être ~1051

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise FileNotFoundError
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}") # Devrait être ~240k

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.") # Devrait être ~600Mo
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")



# --- Nouvelle Cellule ---

# === Cellule 2: Indexation Baseline ===
# Crée l'index Lucene à partir de ap_docs.jsonl (sans prétraitement spécifique).

import os
import subprocess
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Début de l'indexation Baseline...")
print(f"Dossier source: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction ('extract_code_tar_gzip_fixed') a échoué.")

# Commande Pyserini
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Nombre de threads pour l'indexation
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options de stockage
]

print(f"Exécution: {' '.join(index_cmd_baseline)}")
try:
    # Exécuter la commande d'indexation
    # Augmenter le timeout car cela peut être long
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:]) # Afficher la fin de stdout
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si 0 document a été indexé (signe de problème)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except Exception as e:
    # Gérer les erreurs potentielles
    print(f"\nERREUR pendant l'indexation Baseline: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index créé
print(f"\nVérification taille index: {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'" # Commande pour taille dossier
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}") # Afficher la taille
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")


# --- Nouvelle Cellule ---

# === Cellule 0: Configuration Complète (avec Stemming) ===
# Installe Java 21, configure comme défaut, installe outils build,
# pybind11, dernière Pyserini, NLTK+ressources, définit chemins,
# FONCTION preprocess_text AVEC STEMMING, parse topics.

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète (avec Stemming) ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try: subprocess.run(install_java_cmd, shell=True, check=True, timeout=180); print("OpenJDK 21 installé.")
except Exception as e: print(f"ERREUR installation Java 21: {e}"); raise

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try: subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True); subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True); print("update-alternatives configuré.")
    except Exception as e: print(f"ERREUR config update-alternatives: {e}")
else: print(f"ATTENTION: Chemin Java 21 non trouvé: {java_path_21}.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]): print(f"ATTENTION: Chemin JAVA_HOME inexistant.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try: subprocess.run(install_build_cmd, shell=True, check=True, timeout=180); print("Outils de build installés.")
except Exception as e_build: print(f"ERREUR installation outils de build: {e_build}")

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q"
try: subprocess.run(install_pybind_cmd, shell=True, check=True, timeout=60); print("pybind11 installé.")
except Exception as e_pybind: print(f"ERREUR installation pybind11: {e_pybind}")

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try: result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600); print("Paquets Python principaux installés.")
except Exception as e_pip: print(f"ERREUR installation pip: {e_pip}"); raise

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab'] # Liste corrigée
for resource in nltk_resources:
    try:
        if resource == 'punkt' or resource == 'punkt_tab': resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4': resource_path = f'corpora/{resource}.zip'
        elif resource == 'wordnet': resource_path = f'corpora/{resource}.zip'
        else: resource_path = f'corpora/{resource}.zip'
        nltk.data.find(resource_path)
    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try: nltk.download(resource, quiet=True); print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download: print(f"  ERREUR téléchargement '{resource}': {e_download}")
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")
# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

if 'google.colab' in sys.modules:
    try: from google.colab import drive; drive.mount('/content/drive', force_remount=True); print("  Google Drive monté.")
    except Exception as e_mount: print(f"ATTENTION: Erreur montage Drive: {e_mount}")
if not os.path.exists(DRIVE_PROJECT_PATH): raise FileNotFoundError(f"Chemin Drive '{DRIVE_PROJECT_PATH}' inexistant.")
print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar"
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Sera recréé avec stemming
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True); os.makedirs(INDEX_DIR_BASELINE, exist_ok=True); os.makedirs(INDEX_DIR_PREPROC, exist_ok=True);
os.makedirs(CORPUS_DIR, exist_ok=True); os.makedirs(RUN_DIR, exist_ok=True); os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement (AVEC STEMMING) ---
print("\n[8/9] Définition de la fonction preprocess_text (avec Stemming)...")
import nltk
from nltk.corpus import stopwords
# --- Utilisation de PorterStemmer ---
from nltk.stem import PorterStemmer # Import du stemmer
from nltk.tokenize import word_tokenize
import string
stop_words_set_global = set(stopwords.words('english'))
# --- Création de l'objet Stemmer ---
stemmer_obj_global = PorterStemmer() # Création de l'objet
def preprocess_text(text):
    """Applique tokenisation, minuscules, suppression ponctuation/non-alpha, stop words ET STEMMING (Porter)."""
    if not isinstance(text, str): return ""
    try: tokens = word_tokenize(text.lower())
    except LookupError as e_tok: # Gestion erreur si ressource NLTK manque
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              resource_name = str(e_tok).split('Resource ')[1].split(' ')[0]; print(f"--- Tokenizer a besoin de '{resource_name}', tentative téléchargement ---")
              try: nltk.download(resource_name, quiet=True); print(f"--- Ressource '{resource_name}' téléchargée ---"); tokens = word_tokenize(text.lower())
              except Exception as e_dl_tok: print(f"--- Échec téléchargement '{resource_name}': {e_dl_tok} ---"); raise e_tok
         else: raise e_tok
    except Exception as e_tok_other: print(f"Erreur word_tokenize: {e_tok_other}"); raise e_tok_other
    # --- Application du Stemmer ---
    filtered_tokens = [stemmer_obj_global.stem(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie avec PorterStemmer.")
# Tester la nouvelle fonction
sample_text = "This is an example showing Information Retrieval with stemming and stop words removal."
stemmed_sample = preprocess_text(sample_text)
print(f"  Exemple Stemmed: {stemmed_sample}") # Doit afficher 'thi is exampl show inform retriev with stem and stop word remov.'

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
import re
import glob
def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {};
    try:
        with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
        for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
            topic_content = top_match.group(1)
            num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE); topic_id = num_match.group(1).strip() if num_match else None
            title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL); title = title_match.group(1).strip() if title_match else ""
            desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL); desc = desc_match.group(1).strip() if desc_match else ""
            if topic_id and title: topics[topic_id] = {'title': title, 'desc': desc}
    except Exception as e_topic: print(f"  ATTENTION: Erreur parsing {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR): print(f"ATTENTION: Dossier topics '{TOPICS_DIR}' inexistant."); topic_files = []
else: topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
all_topics = {}
if not topic_files: print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else: print(f"  Parsing fichiers topics: {topic_files}"); [all_topics.update(parse_topics(tf)) for tf in topic_files]

try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés."); print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes (avec stemming)...")
    # Appliquer la NOUVELLE fonction preprocess_text (avec stemming)
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries: print(f"\nERREUR prétraitement requêtes: {e_preproc_queries}"); queries_short_preprocessed, queries_long_preprocessed = {}, {}

# --- Vérification Finale Java ---
print("\n--- Vérification Finale Version Java Active ---")
try: result = subprocess.run("java -version", shell=True, check=True, capture_output=True, text=True, timeout=10); print("STDERR:\n", result.stderr); print("\nConfirmation: Java 21 OK." if "21." in result.stderr else "\nATTENTION: Java 21 NON ACTIF ?!")
except Exception as e: print(f"\nERREUR vérification Java: {e}")
# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale Version Pyserini Installée ---")
try: result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30); print(result_pyserini.stdout)
except Exception as e: print(f"ERREUR vérification Pyserini: {e}")

print("\n--- Configuration Complète (avec Stemming) Terminée ---")
print("\nPause..."); time.sleep(2); print("Prêt.")



# --- Nouvelle Cellule ---

# === Cellule 1: Extraire, Décompresser et Formater les Documents ===
# Lit AP.tar, décompresse les .gz internes, extrait <DOC>, <DOCNO>, <TEXT>
# et écrit le résultat dans ap_docs.jsonl.

import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    AP_TAR_PATH
    CORPUS_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.") # Devrait être ~275Mo

# Regex pour extraire les infos
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteurs
doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

try:
    # Ouvrir le fichier de sortie et l'archive TAR
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.") # Devrait être ~1051

        # Boucler sur chaque membre de l'archive
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer si ce n'est pas un fichier .gz ou .Z
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Réinitialiser pour chaque fichier

            try:
                # Extraire le contenu compressé
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # Décompresser le contenu
                    try:
                        content_bytes = gzip.decompress(compressed_content)
                        content = content_bytes.decode('utf-8', errors='ignore') # Décoder après décompression
                    except gzip.BadGzipFile: # Gérer si ce n'est pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au suivant

                    # Trouver tous les blocs <DOC> dans le contenu décompressé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches: continue # Passer si aucun doc trouvé

                    # Boucler sur chaque document trouvé
                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match: continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        # Nettoyer le texte (espaces multiples)
                        doc_text = ' '.join(text_match.group(1).strip().split()) if text_match else ""

                        # Écrire la ligne JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key: print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}"); skipped_members += 1
            except EOFError: print(f"\nAvertissement: Fin fichier inattendue {member.name}."); skipped_members += 1
            except Exception as e_extract: print(f"\nErreur extraction/lecture {member.name}: {e_extract}"); skipped_members += 1

except tarfile.ReadError as e_tar: print(f"\nERREUR lecture TAR {AP_TAR_PATH}: {e_tar}"); raise e_tar
except FileNotFoundError: print(f"\nERREUR: Fichier TAR {AP_TAR_PATH} non trouvé."); raise
except Exception as e_general: print(f"\nERREUR générale traitement TAR: {e_general}"); traceback.print_exc(); raise e_general

# Afficher le résumé de l'extraction
print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0: print(f"  {decompression_errors} erreurs/avertissements décompression.")
print(f"  {doc_count} documents écrits dans {JSONL_OUTPUT_PATH}") # Devrait être ~240k

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale {JSONL_OUTPUT_PATH}: {output_size} octets.") # Devrait être ~600Mo
    if output_size > 0 and doc_count > 0: print("  SUCCÈS: Fichier de sortie contient des données.")
    else: print("  ATTENTION: Fichier de sortie vide ou aucun document écrit.")
else: print(f"  ATTENTION: Fichier {JSONL_OUTPUT_PATH} non créé.")
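# Vérification optionnelle (esquisse) : lire la première ligne du JSONL pour confirmer
# le format attendu par la JsonCollection de Pyserini, c.-à-d. {"id": ..., "contents": ...}.
if os.path.exists(JSONL_OUTPUT_PATH) and os.path.getsize(JSONL_OUTPUT_PATH) > 0:
    with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as f_check:
        premiere_ligne = json.loads(f_check.readline())
    print(f"  Exemple de document: id={premiere_ligne['id']}, contents[:60]={premiere_ligne['contents'][:60]!r}")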



# --- Nouvelle Cellule ---

# === Cellule 2: Indexation Baseline ===
# Crée l'index Lucene à partir de ap_docs.jsonl (sans prétraitement spécifique).

import os
import subprocess
import traceback

# Vérifier que les chemins sont définis (normalement fait par la cellule de config)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Début de l'indexation Baseline...")
print(f"Dossier source: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction a échoué.")

# Commande Pyserini
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Nombre de threads pour l'indexation
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options de stockage
]

print(f"Exécution: {' '.join(index_cmd_baseline)}")
try:
    # Exécuter la commande d'indexation
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 min
    print("Sortie STDOUT (fin):\n", result.stdout[-1000:]) # Afficher la fin de stdout
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si 0 document a été indexé (signe de problème)
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique 0 document indexé.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except Exception as e:
    # Gérer les erreurs potentielles
    print(f"\nERREUR pendant l'indexation Baseline: {e}")
    if isinstance(e, subprocess.CalledProcessError):
        print("Sortie STDOUT:\n", e.stdout)
        print("Sortie STDERR:\n", e.stderr)
    else:
        traceback.print_exc()
    raise e

# Vérifier la taille de l'index créé
print(f"\nVérification taille index: {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'" # Commande pour taille dossier
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille: {result_du.stdout.split()[0]}") # Afficher la taille
    except Exception as e_du:
        print(f"  Impossible de vérifier taille: {e_du}")
else:
    print("  ATTENTION: Dossier index non créé.")


# --- Nouvelle Cellule ---

# === Cellule de Vérification du Contenu du Dossier Runs ===
# Utilise les commandes shell de Colab préfixées par '!'

# Chemin exact où les résultats de recherche sont attendus
# (Défini dans la cellule de configuration complète)
RUN_DIR_PATH="/content/ap_output/runs/"

# Utiliser '!' pour exécuter la commande shell 'echo'
print(f"Vérification du contenu de : {RUN_DIR_PATH}")

# Utiliser '!' pour exécuter la commande shell 'ls -l'
# Mettre le chemin entre guillemets pour gérer les espaces potentiels (même s'il n'y en a pas ici)
!ls -lh "{RUN_DIR_PATH}"



# --- Nouvelle Cellule ---

# === Monter Google Drive ===
from google.colab import drive
import os

try:
    print("Tentative de montage de Google Drive...")
    drive.mount('/content/drive', force_remount=True) # force_remount=True est utile en cas de problème antérieur

    # Vérifier si le point de montage de base existe après la tentative
    if os.path.exists('/content/drive/My Drive'):
        print("\nGoogle Drive monté avec succès sur /content/drive !")
    else:
        print("\nATTENTION: Le montage semble avoir échoué (vérifiez les messages ci-dessus et la fenêtre d'autorisation).")

except Exception as e:
    print(f"\nUne erreur s'est produite lors du montage de Drive: {e}")



# --- Nouvelle Cellule ---

from multiprocessing import Pool
from google.colab import drive
import os

drive.mount("/content/drive")

# Créer le dossier cible s'il n'existe pas déjà (exist_ok rend le except FileExistsError inutile)
target_dir = "/content/drive/MyDrive/Projet_RI"
if not os.path.exists(target_dir):
    os.makedirs(target_dir, exist_ok=True)
    print(f"Directory '{target_dir}' created.")
else:
    print(f"Directory '{target_dir}' already exists.")

os.chdir(target_dir)

def process_file(file):
    # Votre code de prétraitement ici
    # Example: Assuming you want to read the file and return its content
    file_path = os.path.join("AP_Final", file) # Construct the full file path
    # Specify the encoding when opening the file
    with open(file_path, 'r', encoding='latin-1') as f:  # Try 'latin-1' or 'cp1252'
        preprocessed_text = f.read()  # Assign a value to preprocessed_text
    return preprocessed_text

if __name__ == "__main__":
    files = os.listdir("AP_Final")
    with Pool(os.cpu_count()) as p:  # Utilise tous les cœurs
        results = p.map(process_file, files)
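    # Esquisse de vérification (suppose que le dossier AP_Final contient bien des fichiers lisibles) :
    print(f"{len(results)} fichiers lus en parallèle depuis AP_Final.")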

# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (Séquentielle d'abord) ===
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc()) # Affiche la trace complète de l'erreur Java
                 raise # Arrête l'exécution pour ce run si la similarité ne peut être définie
        else:
            print("  Configuration BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante

        # Écrire les résultats dans le fichier de run TREC
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        # Important: Fermer le searcher pour libérer les ressources Java, même en cas d'erreur
        if searcher:
             try:
                 # Note: Pyserini ne semble pas avoir de méthode close() explicite sur LuceneSearcher
                 # La JVM devrait se nettoyer, mais c'est une bonne pratique si disponible
                 # searcher.close() # Décommentez si une telle méthode existe dans votre version
                 print(f"  Nettoyage implicite des ressources pour {run_tag}.")
                 pass
             except Exception as e_close:
                 print(f"  Erreur lors de la tentative de fermeture du searcher pour {run_tag}: {e_close}")


# --- Exécution des différentes configurations (en mode séquentiel) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel) sont terminées.")

# --- Note importante ---
# Si cette cellule s'exécute sans planter (même si c'est lent),
# le problème est probablement lié à la parallélisation (mémoire/conflits JVM).
# Si elle plante encore, surtout lors des runs 'tfidf',
# le problème pourrait être lié à ClassicSimilarity ou à l'environnement Java lui-même.
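# Rappel du format TREC d'une ligne de run produite ci-dessus (exemple fictif) :
#   qid  Q0  docid          rang  score      tag
#   051  Q0  AP880212-0001  1     12.345678  baseline_short_bm25
# Esquisse : afficher les premières lignes d'un run pour vérifier ce format.
if os.path.exists(run_file_1):
    with open(run_file_1, 'r') as f_run_check:
        for _ in range(3):
            ligne = f_run_check.readline().strip()
            if ligne: print(ligne)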


# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (BM25 Séquentiel Uniquement) ===
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
# On importe toujours ClassicSimilarity au cas où, mais on ne l'utilisera pas dans ce test
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    # --- Vérification ajoutée : Ne traiter que BM25 pour ce test ---
    if model != 'bm25':
        print(f"--- Run '{run_tag}' ignoré (Test BM25 uniquement) ---")
        return # Ne rien faire si ce n'est pas BM25

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité (seulement BM25 ici)
        print("  Configuration de BM25...")
        searcher.set_bm25(k1=0.9, b=0.4)
        print("  BM25 configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante

        # Écrire les résultats dans le fichier de run TREC
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        if searcher:
             try:
                 print(f"  Nettoyage implicite des ressources pour {run_tag}.")
                 pass
             except Exception as e_close:
                 print(f"  Erreur lors de la tentative de fermeture du searcher pour {run_tag}: {e_close}")


# --- Exécution des différentes configurations (BM25 seulement) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF (IGNORÉ DANS CETTE VERSION)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel, BM25 uniquement) sont terminées.")

# --- Note importante ---
# Si cette cellule s'exécute sans planter, le problème est très probablement lié
# à l'utilisation de ClassicSimilarity (TF-IDF) dans l'environnement Java actuel.
# Si elle plante encore, le problème est plus profond avec l'initialisation de LuceneSearcher.


# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (Séquentielle - BM25 & TF-IDF) ===
# Utilise Pyserini 0.23.0
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4) # Utilise les paramètres BM25 par défaut de Pyserini
            print("  BM25 configuré.")
        elif model == 'tfidf':
            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 # Tentative de configuration de ClassicSimilarity
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc()) # Affiche la trace complète de l'erreur Java
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return # Arrête l'exécution pour ce run spécifique si TF-IDF échoue
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return # Arrête l'exécution pour ce run spécifique
        else:
            # Sécurité : si le modèle n'est ni bm25 ni tfidf, utilise bm25 par défaut
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante même si une échoue

        # Écrire les résultats dans le fichier de run TREC (seulement si aucune erreur majeure n'est survenue avant la boucle)
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25 (peu probable maintenant)
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        # Nettoyage implicite (Pyserini gère la fermeture de la JVM)
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des différentes configurations (Séquentiel - BM25 & TF-IDF) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel - BM25 & TF-IDF tentative) sont terminées.")

# --- Note importante ---
# Surveillez la sortie lors de l'exécution des runs 'tfidf'.
# Si vous voyez des erreurs Java ou si le kernel plante à nouveau,
# cela signifie que ClassicSimilarity est toujours problématique.
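# Esquisse (non exécutée par défaut) : pour les runs RM3 mentionnés dans l'évaluation
# (noms de fichiers en *_rm3), Pyserini expose set_rm3() sur LuceneSearcher.
# Les paramètres ci-dessous sont indicatifs et à ajuster selon l'expérience ;
# '051' est un identifiant de topic hypothétique.
#
# searcher = LuceneSearcher(INDEX_DIR_BASELINE)
# searcher.set_bm25(k1=0.9, b=0.4)
# searcher.set_rm3(fb_terms=10, fb_docs=10, original_query_weight=0.5)
# hits = searcher.search(queries_short['051'], k=K_RESULTS)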


# --- Nouvelle Cellule ---

# === Cellule 0.3: Définir les chemins ===
import os # Assurez-vous que os est importé

# !!! ADAPTEZ CE CHEMIN VERS VOTRE DOSSIER SUR GOOGLE DRIVE SI NÉCESSAIRE !!!
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Projet_RI/TREC/"

# Vérification que le chemin existe
if not os.path.exists(DRIVE_PROJECT_PATH):
    raise FileNotFoundError(f"Le chemin spécifié n'existe pas : {DRIVE_PROJECT_PATH}. Vérifiez le chemin.")

AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar") # Archive TAR non compressée (ouverte avec le mode "r:" dans la cellule d'extraction)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence") # Définit QRELS_DIR

# Chemins pour les sorties (index, résultats, etc.) dans l'environnement Colab
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus") # Pour les documents extraits/formatés
RUN_DIR = os.path.join(OUTPUT_DIR, "runs") # Définit RUN_DIR
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval") # Définit EVAL_DIR

# Créer les répertoires de sortie s'ils n'existent pas déjà
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

print(f"Chemin du projet Drive: {DRIVE_PROJECT_PATH}")
print(f"Répertoire de sortie Colab: {OUTPUT_DIR}")
print(f"Chemin Qrels: {QRELS_DIR}") # Vérifie que QRELS_DIR est défini
print(f"Chemin Runs: {RUN_DIR}")
print(f"Chemin Eval: {EVAL_DIR}")


# --- Nouvelle Cellule ---

# === Cellule 4.1 & 4.2: Préparation Qrels et Évaluation des Runs ===
import pandas as pd
import glob
import pytrec_eval
import os # Assurez-vous que os est importé
import traceback # Pour afficher les erreurs détaillées

# --- 4.1: Préparer le Fichier Qrels Combiné ---

# Chemins définis précédemment dans la Cellule 0.3 (qui vient d'être exécutée avec succès)
# QRELS_DIR, RUN_DIR, EVAL_DIR devraient être définis

print(f"Préparation des Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files:
    print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}. Vérifiez le chemin.")
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")

all_qrels_data = []
for qf in qrels_files:
    try:
        # Lire le fichier qrels: query_id unused doc_id relevance
        # S'assurer que les IDs sont lus comme des chaînes de caractères
        qrels_df = pd.read_csv(qf, sep=r'\s+', names=['query_id', 'unused', 'doc_id', 'relevance'],
                               dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
        all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']]) # Garder seulement les colonnes utiles
    except Exception as e:
        print(f"Erreur lors de la lecture du fichier Qrels {qf}: {e}")


if not all_qrels_data:
     print("ERREUR: Impossible de lire les données Qrels. Vérifiez les fichiers et les chemins.")
     # Arrêter ici si les qrels ne peuvent pas être chargés
     raise ValueError("Données Qrels non chargées.")
else:
    combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)

    # Convertir en dictionnaire format pytrec_eval: {query_id: {doc_id: relevance}}
    qrels_dict = {}
    for _, row in combined_qrels_df.iterrows():
        qid = row['query_id']
        did = row['doc_id']
        # Assurer que la pertinence est bien un entier
        try:
            rel = int(row['relevance'])
        except ValueError:
            print(f"Avertissement: Valeur de pertinence non entière ignorée pour qid={qid}, did={did}: {row['relevance']}")
            continue

        # Filtrer les jugements non binaires si nécessaire (garder 0 et 1, ou > 0 pour pertinent)
        if rel < 0: # Ignorer les jugements négatifs si présents
             continue

        if qid not in qrels_dict:
            qrels_dict[qid] = {}
        # Stocker la pertinence (pytrec_eval gère différents niveaux, mais ici 0=non pertinent, >0=pertinent)
        qrels_dict[qid][did] = rel

    print(f"Total de {len(qrels_dict)} requêtes avec jugements dans le fichier Qrels combiné.")
    qrels_doc_count = sum(len(docs) for docs in qrels_dict.values())
    print(f"Nombre total de jugements pertinents/non pertinents chargés: {qrels_doc_count}")


    # --- 4.2: Évaluation des Runs ---

    # Mesures à calculer (standard TREC)
    measures = {'map', 'P_10'} # MAP (mean average precision), Precision at 10

    # Initialiser l'évaluateur avec les qrels et les mesures
    # Utiliser seulement les query_ids présents dans les qrels pour l'évaluation
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures)

    # Trouver tous les fichiers de run générés dans RUN_DIR
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt")))
    print(f"\nFichiers de run à évaluer trouvés dans {RUN_DIR}: {len(run_files)}")
    # print(run_files) # Décommentez pour voir la liste exacte

    results_summary = [] # Pour stocker les résultats pour le tableau final

    if not run_files:
        print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}. Vérifiez que l'étape 3 a bien généré des fichiers.")
    else:
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            print(f"\n--- Évaluation de: {run_name} ---")

            # Charger le fichier de run au format TREC
            # pytrec_eval attend un dictionnaire: {query_id: {doc_id: score}}
            run_dict = {}
            line_count = 0
            error_count = 0
            try:
                with open(run_file, 'r') as f_run:
                    for line in f_run:
                        line_count += 1
                        parts = line.strip().split()
                        if len(parts) != 6:
                             # print(f"Ligne mal formatée ignorée dans {run_name} (ligne {line_count}): {line.strip()}")
                             error_count += 1
                             continue
                        qid, _, did, rank, score, _ = parts
                        # Assurer que l'ID de requête est une chaîne, comme dans qrels_dict
                        qid = str(qid)
                        # Assurer que le score est un float
                        try:
                            score = float(score)
                        except ValueError:
                            # print(f"Score non flottant ignoré dans {run_name} (ligne {line_count}): {score}")
                            error_count += 1
                            continue

                        if qid not in run_dict:
                            run_dict[qid] = {}
                        run_dict[qid][did] = score

                if error_count > 0:
                    print(f"  Avertissement: {error_count} lignes mal formatées ignorées dans {run_name}.")

                # Filtrer le run_dict pour ne garder que les query_ids présents dans qrels_dict
                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_queries = len(run_dict) - len(filtered_run_dict)
                if ignored_queries > 0:
                    print(f"  Avertissement: {ignored_queries} requêtes du run ignorées car absentes des Qrels.")

                if not filtered_run_dict:
                     print("  Erreur: Aucune requête du run ne correspond aux Qrels. Impossible d'évaluer.")
                     continue

                # Effectuer l'évaluation sur les données filtrées
                eval_results = evaluator.evaluate(filtered_run_dict)

                # Calculer les moyennes sur toutes les requêtes évaluées
                # Gérer le cas où une métrique pourrait manquer pour une requête (peu probable avec MAP, P@10)
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]

                # Éviter la division par zéro si aucune requête n'a pu être évaluée
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")
                print("-" * (20 + len(run_name)))

                # Extraire les informations pour le tableau récapitulatif
                # Gère les noms de fichiers comme 'baseline_short_bm25.txt' ou 'preproc_long_tfidf.txt'
                parts = run_name.replace('.txt','').split('_')
                if len(parts) >= 3:
                    index_type = parts[0] # baseline ou preproc
                    query_type = parts[1] # short ou long
                    model_type = parts[2] # bm25 ou tfidf
                    # Gérer le cas RM3 si on l'ajoute plus tard
                    if len(parts) > 3 and parts[3] == 'rm3':
                         model_type += "+RM3"

                    results_summary.append({
                        "Run Name": run_name,
                        "Index": index_type,
                        "Query Type": query_type.capitalize(), # Met la première lettre en majuscule
                        "Weighting Scheme": model_type.upper(), # Met en majuscules (BM25, TFIDF)
                        "MAP": avg_map,
                        "P@10": avg_p10
                    })
                else:
                     print(f"  Avertissement: Impossible de parser le nom du run '{run_name}' pour le résumé.")

            except FileNotFoundError:
                 print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e:
                 print(f"  Erreur lors de l'évaluation de {run_name}: {e}")
                 print(traceback.format_exc())

        # Afficher le tableau récapitulatif si des résultats ont été collectés
        if results_summary:
            print("\n\n=== Tableau Récapitulatif des Résultats (Partie 1) ===")
            results_df = pd.DataFrame(results_summary)

            # Pivoter pour obtenir le format demandé (plus ou moins)
            try:
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP (Moyenne des Précisions Moyennes) ---")
                print(pivot_map.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot_map:
                 print(f"\nErreur lors de la création du tableau pivot MAP: {e_pivot_map}")
                 print("Affichage du DataFrame brut MAP:")
                 print(results_df[['Query Type', 'Weighting Scheme', 'Index', 'MAP']].to_markdown(index=False, floatfmt=".4f"))


            try:
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 (Précision aux 10 premiers documents) ---")
                print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot_p10:
                 print(f"\nErreur lors de la création du tableau pivot P@10: {e_pivot_p10}")
                 print("Affichage du DataFrame brut P@10:")
                 print(results_df[['Query Type', 'Weighting Scheme', 'Index', 'P@10']].to_markdown(index=False, floatfmt=".4f"))


            # Sauvegarder le DataFrame pour une utilisation ultérieure (ex: rapport)
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_part1.csv")
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif sauvegardé dans {summary_file_path}")
            except Exception as e_save:
                 print(f"\nErreur lors de la sauvegarde du résumé dans {summary_file_path}: {e_save}")

        else:
            print("\nAucun résultat d'évaluation à afficher ou sauvegarder.")



# --- Nouvelle Cellule ---

# === Cellule de Vérification Java (à exécuter JUSTE AVANT la Cellule 5.1 / rm3_run_code) ===
# Ceci vérifie quelle version de Java le kernel Python voit ACTUELLEMENT
print("--- Vérification de la version Java vue par le kernel ACTUEL ---")
!java -version
print("-------------------------------------------------------------")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Chemins Corrigés) ===
import os
import sys
import subprocess
import time

print("--- Début de la Configuration Complète ---")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
subprocess.run(install_java_cmd, shell=True, check=True)
print("OpenJDK 21 installé.")

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
    subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
    print("update-alternatives configuré pour java.")
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11"
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # nltk.data.find lève LookupError si la ressource est absente
        if resource == 'punkt':
            nltk.data.find(f'tokenizers/{resource}')
        else:
            nltk.data.find(f'corpora/{resource}')
    except LookupError:
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! CHEMIN CORRIGÉ SELON VOS INDICATIONS !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # Chemin vers le sous-dossier TREC

# --- Le reste du code vérifie le chemin et définit les autres variables ---
if not os.path.exists(DRIVE_PROJECT_PATH):
    try:
        from google.colab import drive
        print("  Montage de Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        if not os.path.exists(DRIVE_PROJECT_PATH):
             raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe TOUJOURS PAS après montage. Vérifiez le chemin exact et le nom des dossiers.")
    except ModuleNotFoundError:
         raise FileNotFoundError(f"Google Colab Drive non trouvé et chemin '{DRIVE_PROJECT_PATH}' inexistant.")
    except Exception as e_mount:
         raise FileNotFoundError(f"Erreur lors du montage de Drive ou chemin '{DRIVE_PROJECT_PATH}' inexistant: {e_mount}")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

# !!! NOM DE FICHIER CORRIGÉ SELON VOS INDICATIONS !!!
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar") # Utilise AP.tar au lieu de AP.tar.gz
# Note: Pensez à modifier la Cellule 0.4 (extraction) pour ouvrir avec "r:" au lieu de "r:gz"

TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")


# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
import glob
import re
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics
topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")


# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR:\n", result.stderr) # Version souvent sur stderr
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")


# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")


print("\n--- Configuration Complète Terminée ---")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Chemins Corrigés) ===
# ... (début de la cellule inchangé : installation Java, build tools, pip, etc.) ...

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! VÉRIFIEZ CE CHEMIN VERS LE DOSSIER CONTENANT AP.tar !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # Est-ce le bon dossier ?

# --- Le reste du code vérifie le chemin et définit les autres variables ---
if not os.path.exists(DRIVE_PROJECT_PATH):
    try:
        from google.colab import drive
        print("  Montage de Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        if not os.path.exists(DRIVE_PROJECT_PATH):
             raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe TOUJOURS PAS après montage. Vérifiez le chemin exact et le nom des dossiers.")
    except ModuleNotFoundError:
         raise FileNotFoundError(f"Google Colab Drive non trouvé et chemin '{DRIVE_PROJECT_PATH}' inexistant.")
    except Exception as e_mount:
         raise FileNotFoundError(f"Erreur lors du montage de Drive ou chemin '{DRIVE_PROJECT_PATH}' inexistant: {e_mount}")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

# !!! VÉRIFIEZ CE NOM DE FICHIER EXACT !!!
AP_TAR_FILENAME = "AP.tar" # Est-ce bien 'AP.tar' ? Ou 'ap.tar' ? Autre ?
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
# Note: Pensez à modifier la Cellule 0.4 (extraction) pour ouvrir avec "r:" au lieu de "r:gz"

TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes") # Ces sous-dossiers existent-ils ?
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence") # Ces sous-dossiers existent-ils ?
OUTPUT_DIR = "/content/ap_output"
# ... (définition des autres chemins inchangée) ...
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}") # Affiche le chemin complet qui sera vérifié
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# ... (reste de la cellule inchangé : définition preprocess_text, parsing topics, vérifications finales) ...



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire et Formater les Documents depuis AP.tar ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression
import os # Assurer que os est importé
import traceback # Pour afficher les erreurs

# Chemins définis dans la cellule précédente (combined_setup_paths_fixed)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe (devrait être bon maintenant, mais double vérification)
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé. Vérifiez le chemin et le nom du fichier dans la cellule de configuration.")

# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0

# Ouvrir/créer le fichier JSONL de sortie
# Utiliser le mode "r:" pour un fichier .tar non compressé
try:
    # Utiliser encoding='utf-8' pour l'écriture du fichier JSONL
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"  {len(members)} membres trouvés dans l'archive TAR.")
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile():
                skipped_members += 1
                continue

            file_read_count += 1
            # Extraire le contenu du fichier
            try:
                f = tar.extractfile(member)
                if f: # S'assurer que l'extraction a réussi
                    # Lire et décoder avec gestion des erreurs
                    content = f.read().decode('utf-8', errors='ignore')
                    f.close()

                    # Trouver tous les documents (<DOC>...</DOC>) dans le fichier actuel
                    for doc_match in doc_pattern.finditer(content):
                        doc_content = doc_match.group(1)

                        # Extraire DOCNO
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        # Extraire TEXT
                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split()) # Nettoyage espaces
                        else:
                            doc_text = ""

                        # Écrire l'entrée JSONL
                        try:
                            json_line = json.dumps({"id": doc_id, "contents": doc_text})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                # Peut arriver si le membre est listé mais inaccessible
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Est-il corrompu ou n'est-ce pas un fichier TAR valide? Erreur: {e_tar}")
    raise e_tar # Arrêter si le TAR est illisible
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé au moment de l'ouverture.")
     raise
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement du fichier TAR: {e_general}")
     traceback.print_exc()
     raise e_general


print(f"\nTraitement terminé.")
print(f"  {file_read_count} fichiers lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés (dossiers ou erreurs).")
print(f"  {doc_count} documents formatés et écrits dans {JSONL_OUTPUT_PATH}")
if doc_count < 100000: # Seuil arbitraire pour AP
     print("  ATTENTION: Le nombre de documents extraits semble faible. Vérifiez le fichier TAR et les regex.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
print(f"Collection source: {JSONL_OUTPUT_PATH}")
print(f"Répertoire de l'index: {INDEX_DIR_BASELINE}")

# Commande Pyserini pour l'indexation
# -input: dossier contenant les fichiers JSONL (ici CORPUS_DIR)
# -collection: type de collection (JsonCollection pour nos fichiers .jsonl)
# -generator: comment traiter les fichiers (DefaultLuceneDocumentGenerator crée un document Lucene par ligne JSON)
# -index: chemin où sauvegarder l'index
# -threads: nombre de threads à utiliser (ajustez si besoin, 4 est raisonnable pour Colab)
# -storePositions -storeDocvectors -storeRaw: stocke informations supplémentaires utiles
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw"
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    # Vous pouvez décider de lever l'erreur pour arrêter ou juste afficher un message
    # raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    # raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    import traceback
    traceback.print_exc()
    # raise e
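
# Vérification facultative (esquisse): Pyserini expose une classe IndexReader qui permet
# d'afficher quelques statistiques de l'index fraîchement créé (nombre de documents,
# de termes uniques, etc.) afin de confirmer que l'indexation a bien produit quelque chose.
# À n'exécuter que si l'indexation ci-dessus s'est terminée sans erreur.
try:
    from pyserini.index.lucene import IndexReader
    index_reader = IndexReader(INDEX_DIR_BASELINE)
    stats = index_reader.stats()  # dict contenant notamment 'documents' et 'unique_terms'
    print(f"Statistiques de l'index baseline: {stats}")
except Exception as e_stats:
    print(f"Impossible de lire les statistiques de l'index baseline: {e_stats}")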



# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
import json
from tqdm.notebook import tqdm
import os
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
# CORPUS_DIR

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# S'assurer que la fonction preprocess_text est définie (normalement fait dans la cellule de setup)
if 'preprocess_text' not in globals():
    print("Erreur: La fonction 'preprocess_text' n'est pas définie. Ré-exécutez la cellule de configuration.")
    # Optionnel: Redéfinir ici si nécessaire (copier depuis la cellule de setup)
    raise NameError("preprocess_text non définie")
else:
    doc_count_preproc = 0
    error_count = 0
    # Lire le fichier JSONL original et écrire le fichier prétraité
    try:
        # Utiliser utf-8 pour lire et écrire
        with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
             open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:

            # Itérer sur le fichier d'entrée
            # Utiliser tqdm pour la barre de progression
            for line in tqdm(infile, desc="Prétraitement des documents"):
                try:
                    data = json.loads(line)
                    # Utiliser .get pour la robustesse si 'id' ou 'contents' manque
                    doc_id = data.get('id', None)
                    original_contents = data.get('contents', '')

                    if doc_id is None:
                        # print("Avertissement: Ligne JSON sans 'id', ignorée.")
                        error_count += 1
                        continue

                    # Appliquer le prétraitement
                    preprocessed_contents = preprocess_text(original_contents)

                    # Écrire la nouvelle ligne JSONL
                    # S'assurer que l'ID est une chaîne et le contenu aussi
                    json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                    outfile.write(json_line + '\n')
                    doc_count_preproc += 1

                except json.JSONDecodeError:
                    # print(f"Avertissement: Erreur de décodage JSON sur une ligne, ignorée.")
                    error_count += 1
                except Exception as e_line:
                    print(f"\nErreur inattendue lors du prétraitement d'une ligne (id={data.get('id', 'inconnu')}): {e_line}")
                    error_count += 1
                    # Optionnel: Afficher la trace pour débugger des erreurs spécifiques
                    # traceback.print_exc()


        print(f"\nTerminé.")
        print(f"  {doc_count_preproc} documents prétraités et écrits dans {JSONL_PREPROC_PATH}")
        if error_count > 0:
             print(f"  {error_count} lignes ignorées à cause d'erreurs.")

    except FileNotFoundError:
        print(f"ERREUR: Le fichier d'entrée {JSONL_OUTPUT_PATH} n'a pas été trouvé.")
        raise
    except Exception as e_main:
        print(f"ERREUR générale lors de la préparation des données prétraitées: {e_main}")
        traceback.print_exc()
        raise
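
# Vérification rapide (esquisse): relire la première ligne du fichier prétraité pour
# contrôler visuellement le résultat du prétraitement (stopwords retirés, mots lemmatisés).
if os.path.exists(JSONL_PREPROC_PATH):
    with open(JSONL_PREPROC_PATH, 'r', encoding='utf-8') as f_check:
        premiere_ligne = f_check.readline().strip()
    print(f"Première ligne prétraitée (tronquée): {premiere_ligne[:200]}")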



# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source
# INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Dossier cible pour l'index
# CORPUS_DIR contient le fichier JSONL prétraité

print(f"Début de l'indexation avec Prétraitement...")
# Note: Pyserini attend un DOSSIER en entrée pour JsonCollection et indexe TOUS les
# fichiers .jsonl qu'il contient; si CORPUS_DIR contient encore ap_docs.jsonl, les deux
# fichiers seront indexés ensemble (voir l'esquisse proposée après cette cellule pour
# isoler le fichier prétraité dans son propre dossier).
print(f"Collection source (dossier): {CORPUS_DIR}")
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe
if not os.path.exists(JSONL_PREPROC_PATH):
    raise FileNotFoundError(f"Le fichier de données prétraitées {JSONL_PREPROC_PATH} n'a pas été trouvé. Assurez-vous que l'étape précédente (1.3) s'est bien terminée.")

# Commande Pyserini pour l'indexation prétraitée
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR, # Pointeur vers le dossier contenant les jsonl
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Important: Indique que le texte est déjà tokenisé/traité
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_preproc)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    print(f"\nIndexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Prétraitée a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Prétraitée a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Prétraitée: {e}")
    traceback.print_exc()
    raise e
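
# Esquisse facultative (les noms de dossier ci-dessous sont hypothétiques): comme
# JsonCollection indexe tous les fichiers .jsonl du dossier passé à --input, une façon
# d'éviter que ap_docs.jsonl et ap_docs_preprocessed.jsonl soient indexés ensemble est
# d'isoler le fichier prétraité dans son propre dossier et de pointer --input dessus.
import shutil
CORPUS_DIR_PREPROC = os.path.join(OUTPUT_DIR, "corpus_preprocessed")  # chemin hypothétique
os.makedirs(CORPUS_DIR_PREPROC, exist_ok=True)
shutil.copy(JSONL_PREPROC_PATH, os.path.join(CORPUS_DIR_PREPROC, os.path.basename(JSONL_PREPROC_PATH)))
# index_cmd_preproc utiliserait alors "--input", CORPUS_DIR_PREPROC au lieu de CORPUS_DIR.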



# --- Nouvelle Cellule ---

# === Cellule 3.1: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Utilise la dernière Pyserini et Java 21
# Assurez-vous que les variables d'index et de requêtes sont définies par la cellule de config
# INDEX_DIR_BASELINE, INDEX_DIR_PREPROC
# queries_short, queries_long, queries_short_preprocessed, queries_long_preprocessed
# K_RESULTS devrait aussi être défini (sinon, on le mettra à 1000)

from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées
import os # Assurer que os est importé
from jnius import autoclass, JavaException # Importer pour TF-IDF

# Essayer de définir K_RESULTS si ce n'est pas déjà fait
try:
    K_RESULTS
except NameError:
    print("Définition de K_RESULTS (nombre de résultats) à 1000...")
    K_RESULTS = 1000

# --- Configuration des modèles de similarité ---
# Charger la classe Java pour TF-IDF (ClassicSimilarity)
# Mettre dans un try-except au cas où l'import échouerait encore (peu probable avec Java 21)
try:
    ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
    print("Classe ClassicSimilarity (pour TF-IDF) chargée avec succès.")
except JavaException as e_load_class:
    print(f"ERREUR Java lors du chargement de ClassicSimilarity: {e_load_class}")
    print("Les recherches TF-IDF échoueront probablement.")
    ClassicSimilarity = None # Mettre à None pour pouvoir vérifier plus tard
except Exception as e_load_gen:
     print(f"ERREUR inattendue lors du chargement de ClassicSimilarity: {e_load_gen}")
     ClassicSimilarity = None


def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        # Assurer que LuceneSearcher est importé
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None:
                 print("ERREUR: Classe ClassicSimilarity non chargée. Impossible de configurer TF-IDF.")
                 print(f"--- ABANDON du run {run_tag} ---")
                 return # Ne pas continuer si la classe n'a pas pu être chargée

            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return
        else:
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # S'assurer que preprocess_text est défini
                if 'preprocess_text' not in globals():
                     raise NameError("La fonction preprocess_text n'est pas définie.")

                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                # Compter les erreurs par requête mais continuer
                query_errors += 1
                if query_errors < 10: # Limiter l'affichage des erreurs par requête
                     print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                elif query_errors == 10:
                     print("\nPlusieurs erreurs de recherche pour ce run, messages suivants masqués...")


        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
             with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes de résultats écrites.")
        else:
            print("\n  Avertissement: Aucun résultat généré pour ce run.")

        if query_errors > 0:
            print(f"  Avertissement: {query_errors} erreurs rencontrées lors de la recherche sur les requêtes individuelles.")

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc())
    finally:
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des 8 configurations de recherche (Séquentiel) ---

print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Tout-en-un) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Essayer de trouver la ressource pour éviter un re-téléchargement inutile
        # ('punkt' est un tokenizer, les autres sont des corpus)
        prefix = 'tokenizers' if resource == 'punkt' else 'corpora'
        nltk.data.find(f'{prefix}/{resource}')
        # print(f"  Ressource NLTK '{resource}' déjà présente.")
    except LookupError:
        # nltk.data.find lève LookupError lorsque la ressource est absente
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
stop_words_set = set(stopwords.words('english'))
lemmatizer_obj = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire et Formater les Documents depuis AP.tar ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression
import os # Assurer que os est importé
import traceback # Pour afficher les erreurs

# Chemins définis dans la cellule précédente (full_setup_code)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe (devrait être bon maintenant)
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé. Vérifiez le chemin et le nom du fichier dans la cellule de configuration.")

# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0

# Ouvrir/créer le fichier JSONL de sortie
# Utiliser le mode "r:" pour un fichier .tar non compressé
try:
    # Utiliser encoding='utf-8' pour l'écriture du fichier JSONL
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"  {len(members)} membres trouvés dans l'archive TAR.")
        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile():
                skipped_members += 1
                continue

            file_read_count += 1
            # Extraire le contenu du fichier
            try:
                f = tar.extractfile(member)
                if f: # S'assurer que l'extraction a réussi
                    # Lire et décoder avec gestion des erreurs
                    content = f.read().decode('utf-8', errors='ignore')
                    f.close()

                    # Trouver tous les documents (<DOC>...</DOC>) dans le fichier actuel
                    for doc_match in doc_pattern.finditer(content):
                        doc_content = doc_match.group(1)

                        # Extraire DOCNO
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        # Extraire TEXT
                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split()) # Nettoyage espaces
                        else:
                            doc_text = ""

                        # Écrire l'entrée JSONL
                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)}) # Assurer str
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                # Peut arriver si le membre est listé mais inaccessible
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Est-il corrompu ou n'est-ce pas un fichier TAR valide? Erreur: {e_tar}")
    raise e_tar # Arrêter si le TAR est illisible
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé au moment de l'ouverture.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement du fichier TAR: {e_general}")
     traceback.print_exc()
     raise e_general


print(f"\nTraitement terminé.")
print(f"  {file_read_count} fichiers lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés (dossiers ou erreurs).")
print(f"  {doc_count} documents formatés et écrits dans {JSONL_OUTPUT_PATH}")
if doc_count < 100000: # Seuil arbitraire pour AP
     print("  ATTENTION: Le nombre de documents extraits semble faible. Vérifiez le fichier TAR et les regex.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e



# --- Nouvelle Cellule ---

# === Cellule de Vérification et Nettoyage du Corpus ===
import os
import subprocess

print("--- Vérification du contenu du dossier Corpus ---")

# Redéfinir CORPUS_DIR au cas où (normalement défini dans la config)
OUTPUT_DIR = "/content/ap_output"
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")

# Vérifier si le dossier CORPUS_DIR existe
if not os.path.exists(CORPUS_DIR):
    print(f"ERREUR: Le dossier {CORPUS_DIR} n'existe pas. L'étape d'extraction a peut-être échoué.")
else:
    print(f"Contenu du dossier : {CORPUS_DIR}")
    # Utiliser !ls pour lister le contenu
    !ls -lh {CORPUS_DIR}
    print("-" * 30)

    print("\n--- Vérification du format de ap_docs.jsonl ---")
    jsonl_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
    if not os.path.exists(jsonl_path):
        print(f"ERREUR: Le fichier {jsonl_path} n'existe pas. L'étape d'extraction a échoué.")
    else:
        print(f"Affichage des 3 premières lignes de : {jsonl_path}")
        # Utiliser !head pour afficher les premières lignes
        !head -n 3 {jsonl_path}
        print("-" * 30)

    print("\n--- Vérification et Nettoyage potentiel ---")
    preproc_jsonl_path = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")
    if os.path.exists(preproc_jsonl_path):
        print(f"Le fichier prétraité {preproc_jsonl_path} existe.")
        print("Il va être supprimé pour éviter les interférences avec l'indexation baseline.")
        try:
            # Utiliser !rm pour supprimer le fichier
            rm_cmd = f"rm '{preproc_jsonl_path}'" # Mettre des guillemets au cas où il y aurait des espaces
            print(f"Exécution de : {rm_cmd}")
            subprocess.run(rm_cmd, shell=True, check=True, capture_output=True, text=True)
            print(f"Fichier {preproc_jsonl_path} supprimé avec succès.")
            # Vérifier à nouveau le contenu du dossier
            print("\nNouveau contenu du dossier :")
            !ls -lh {CORPUS_DIR}
        except subprocess.CalledProcessError as e:
            print(f"ERREUR lors de la suppression de {preproc_jsonl_path}: {e}")
            print("Sortie STDERR:", e.stderr)
        except Exception as e:
            print(f"ERREUR inattendue lors de la suppression: {e}")
    else:
        print(f"Le fichier prétraité {preproc_jsonl_path} n'existe pas. Aucun nettoyage nécessaire.")
    print("-" * 30)

print("\n--- Vérification et Nettoyage Terminés ---")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire et Formater les Documents depuis AP.tar (Avec Debug) ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression
import os # Assurer que os est importé
import traceback # Pour afficher les erreurs

# Chemins définis précédemment
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")
print("--- AJOUT DE DEBUG ---")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé. Vérifiez le chemin et le nom du fichier.")
else:
    # Afficher la taille du fichier TAR
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")
    if tar_size < 1024 * 1024: # Moins de 1 Mo, suspect pour AP
        print("  ATTENTION: La taille du fichier TAR semble très petite !")


# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
docs_found_in_files = 0
first_doc_id_found = None
first_doc_text_sample = None

# Ouvrir/créer le fichier JSONL de sortie
# Utiliser le mode "r:" pour un fichier .tar non compressé
try:
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\nDEBUG: {len(members)} membres trouvés dans l'archive TAR.")
        if not members:
             print("ATTENTION: Aucun membre trouvé dans l'archive TAR. Le fichier est peut-être vide ou corrompu.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            if not member.isfile():
                skipped_members += 1
                # print(f"DEBUG: Membre ignoré (pas un fichier): {member.name}")
                continue

            file_read_count += 1
            if file_read_count % 50 == 0: # Afficher un message tous les 50 fichiers lus
                 print(f"DEBUG: Lecture du fichier {file_read_count}/{len(members)}: {member.name}")

            try:
                f = tar.extractfile(member)
                if f:
                    content = f.read().decode('utf-8', errors='ignore')
                    f.close()

                    # DEBUG: Vérifier si des balises <DOC> sont trouvées
                    doc_matches = doc_pattern.findall(content)
                    num_docs_in_file = len(doc_matches)
                    if num_docs_in_file > 0:
                        docs_found_in_files += 1
                        # print(f"DEBUG: Trouvé {num_docs_in_file} <DOC> dans {member.name}")
                    # elif file_read_count <= 10: # Afficher pour les 10 premiers fichiers si aucun doc trouvé
                         # print(f"DEBUG: Trouvé 0 <DOC> dans {member.name}")


                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        # DEBUG: Sauvegarder le premier ID et extrait de texte trouvés
                        if first_doc_id_found is None:
                            first_doc_id_found = doc_id
                            first_doc_text_sample = doc_text[:100] + "..." # Extrait

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Est-il corrompu ou n'est-ce pas un fichier TAR valide? Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé au moment de l'ouverture.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement du fichier TAR: {e_general}")
     traceback.print_exc()
     raise e_general


print(f"\n--- Fin de l'Extraction (Avec Debug) ---")
print(f"  {file_read_count} fichiers lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés.")
print(f"  {docs_found_in_files} fichiers contenaient au moins une balise <DOC>.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")
if first_doc_id_found:
    print(f"  Premier Doc ID trouvé: {first_doc_id_found}")
    print(f"  Extrait du premier texte: {first_doc_text_sample}")
else:
    print("  Aucun document avec ID et Texte n'a été trouvé/extrait.")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait ! Vérifiez les regex ou la structure interne des fichiers dans AP.tar. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire, Décompresser et Formater les Documents ===
import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Chemins définis précédemment
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex (inchangées)
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

# Ouvrir/créer le fichier JSONL de sortie
try:
    # Utiliser encoding='utf-8' pour l'écriture
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            if not member.isfile() or not member.name.endswith(('.gz', '.Z')): # Traiter seulement les fichiers .gz ou .Z
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Initialiser content

            try:
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # --- AJOUT : Décompression Gzip ---
                    try:
                        # Décompresser le contenu lu
                        content_bytes = gzip.decompress(compressed_content)
                        # Décoder en texte APRES décompression
                        content = content_bytes.decode('utf-8', errors='ignore')
                    except gzip.BadGzipFile:
                        # print(f"Avertissement: Fichier {member.name} n'est pas un fichier gzip valide, tentative de lecture directe.")
                        # Essayer de décoder directement si ce n'était pas du gzip (moins probable vu les noms)
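                        # Remarque: les fichiers .Z (format Unix "compress"/LZW) ne sont pas
                        # gérés par le module gzip et aboutiront aussi dans cette branche;
                        # leur contenu décodé directement ne sera alors pas exploitable.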
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au fichier suivant si la décompression échoue
                    # --- FIN AJOUT ---

                    # Chercher les documents dans le contenu décompressé et décodé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches:
                         # Si aucun <DOC> trouvé, passer au membre suivant
                         continue

                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé.")
     raise FileNotFoundError
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement: {e_general}")
     traceback.print_exc()
     raise e_general

print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés (pas .gz/.Z ou erreur lecture).")
if decompression_errors > 0:
    print(f"  {decompression_errors} erreurs de décompression rencontrées.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait même après tentative de décompression ! Vérifiez les regex ou la structure interne des fichiers décompressés. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")
    elif output_size > 0 and doc_count > 0:
         print("  SUCCÈS: Le fichier de sortie contient des données.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (maintenant non vide)
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction a échoué.")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé malgré un fichier source non vide. Problème potentiel.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")




# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
import json
from tqdm.notebook import tqdm
import os
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (non vide)
# CORPUS_DIR

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# S'assurer que la fonction preprocess_text est définie (normalement fait dans la cellule de setup)
if 'preprocess_text' not in globals():
    print("Erreur: La fonction 'preprocess_text' n'est pas définie. Ré-exécutez la cellule de configuration.")
    raise NameError("preprocess_text non définie")
else:
    doc_count_preproc = 0
    error_count = 0
    # Lire le fichier JSONL original et écrire le fichier prétraité
    try:
        # Utiliser utf-8 pour lire et écrire
        with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
             open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:

            # Itérer sur le fichier d'entrée
            # Utiliser tqdm pour la barre de progression
            for line in tqdm(infile, desc="Prétraitement des documents"):
                try:
                    data = json.loads(line)
                    # Utiliser .get pour la robustesse si 'id' ou 'contents' manque
                    doc_id = data.get('id', None)
                    original_contents = data.get('contents', '')

                    if doc_id is None:
                        error_count += 1
                        continue

                    # Appliquer le prétraitement
                    preprocessed_contents = preprocess_text(original_contents)

                    # Écrire la nouvelle ligne JSONL
                    json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                    outfile.write(json_line + '\n')
                    doc_count_preproc += 1

                except json.JSONDecodeError:
                    error_count += 1
                except Exception as e_line:
                    print(f"\nErreur inattendue lors du prétraitement d'une ligne (id={data.get('id', 'inconnu')}): {e_line}")
                    error_count += 1

        print(f"\nTerminé.")
        print(f"  {doc_count_preproc} documents prétraités et écrits dans {JSONL_PREPROC_PATH}")
        if error_count > 0:
             print(f"  {error_count} lignes ignorées à cause d'erreurs.")

        # Vérifier la taille du fichier de sortie
        if os.path.exists(JSONL_PREPROC_PATH):
            output_size = os.path.getsize(JSONL_PREPROC_PATH)
            print(f"  Taille finale de {JSONL_PREPROC_PATH}: {output_size} octets.")
            if output_size == 0 and doc_count_preproc > 0:
                 print("  ATTENTION: 0 octet écrit malgré le traitement de documents. Problème ?")
        else:
            print(f"  ATTENTION: Le fichier de sortie {JSONL_PREPROC_PATH} n'a pas été créé.")


    except FileNotFoundError:
        print(f"ERREUR: Le fichier d'entrée {JSONL_OUTPUT_PATH} n'a pas été trouvé.")
        raise
    except Exception as e_main:
        print(f"ERREUR générale lors de la préparation des données prétraitées: {e_main}")
        traceback.print_exc()
        raise
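# --- Esquisse optionnelle: aperçu rapide de l'effet de preprocess_text ---
# Phrase fictive, purement illustrative: minuscules, tokenisation, suppression
# des tokens non alphabétiques et des stopwords, lemmatisation. La sortie
# exacte dépend des ressources NLTK effectivement téléchargées.
if 'preprocess_text' in globals():
    exemple = "The dogs were running quickly through 3 large parks!"
    print(f"  Exemple avant prétraitement: {exemple}")
    print(f"  Exemple après prétraitement: {preprocess_text(exemple)}")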



# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source
# INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Dossier cible pour l'index
# CORPUS_DIR contient le fichier JSONL prétraité

print(f"Début de l'indexation avec Prétraitement...")
# Note: Pyserini s'attend à un dossier en entrée pour JsonCollection,
# il trouvera ap_docs_preprocessed.jsonl dans CORPUS_DIR.
print(f"Collection source (dossier): {CORPUS_DIR}")
JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Chemin complet pour vérification
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe et n'est pas vide
if not os.path.exists(JSONL_PREPROC_PATH) or os.path.getsize(JSONL_PREPROC_PATH) == 0:
    raise FileNotFoundError(f"Le fichier de données prétraitées {JSONL_PREPROC_PATH} est manquant ou vide. L'étape précédente (1.3) a échoué.")

# Commande Pyserini pour l'indexation prétraitée
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR, # Pointeur vers le dossier contenant les jsonl
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Important: Indique que le texte est déjà tokenisé/traité
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_preproc)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé. Problème potentiel avec l'indexation prétraitée.")
    else:
        print(f"\nIndexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Prétraitée a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Prétraitée a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Prétraitée: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_PREPROC}...")
if os.path.exists(INDEX_DIR_PREPROC):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_PREPROC}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Cellule 3.1: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Utilise la dernière Pyserini et Java 21
# Assurez-vous que les variables d'index et de requêtes sont définies par la cellule de config
# INDEX_DIR_BASELINE, INDEX_DIR_PREPROC
# queries_short, queries_long, queries_short_preprocessed, queries_long_preprocessed
# K_RESULTS devrait aussi être défini (sinon, on le mettra à 1000)

from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées
import os # Assurer que os est importé
from jnius import autoclass, JavaException # Importer pour TF-IDF

# Essayer de définir K_RESULTS si ce n'est pas déjà fait
try:
    K_RESULTS
except NameError:
    print("Définition de K_RESULTS (nombre de résultats) à 1000...")
    K_RESULTS = 1000

# --- Configuration des modèles de similarité ---
# Charger la classe Java pour TF-IDF (ClassicSimilarity)
# Mettre dans un try-except au cas où l'import échouerait (peu probable avec Java 21)
try:
    ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
    print("Classe ClassicSimilarity (pour TF-IDF) chargée avec succès.")
except JavaException as e_load_class:
    print(f"ERREUR Java lors du chargement de ClassicSimilarity: {e_load_class}")
    print("Les recherches TF-IDF échoueront probablement.")
    ClassicSimilarity = None # Mettre à None pour pouvoir vérifier plus tard
except Exception as e_load_gen:
     print(f"ERREUR inattendue lors du chargement de ClassicSimilarity: {e_load_gen}")
     ClassicSimilarity = None


def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        # Assurer que LuceneSearcher est importé
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None:
                 print("ERREUR: Classe ClassicSimilarity non chargée. Impossible de configurer TF-IDF.")
                 print(f"--- ABANDON du run {run_tag} ---")
                 return # Ne pas continuer si la classe n'a pas pu être chargée

            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return
        else:
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # S'assurer que preprocess_text est défini
                if 'preprocess_text' not in globals():
                     raise NameError("La fonction preprocess_text n'est pas définie.")

                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                # Vérifier si la requête traitée est vide
                if not search_text.strip():
                     # print(f"  Avertissement: Requête QID {query_id} est vide après traitement, ignorée.")
                     continue # Ignorer les requêtes vides

                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    # S'assurer que doc_id n'est pas None (peut arriver dans de rares cas)
                    if doc_id is None:
                        # print(f"  Avertissement: Doc ID est None pour QID {query_id} au rang {rank}, ignoré.")
                        continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                # Compter les erreurs par requête mais continuer
                query_errors += 1
                if query_errors < 10: # Limiter l'affichage des erreurs par requête
                     print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                elif query_errors == 10:
                     print("\nPlusieurs erreurs de recherche pour ce run, messages suivants masqués...")


        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
             # Utiliser encoding='utf-8' pour l'écriture
             with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes de résultats écrites.")
        else:
            print("\n  Avertissement: Aucun résultat généré pour ce run.")

        if query_errors > 0:
            print(f"  Avertissement: {query_errors} erreurs rencontrées lors de la recherche sur les requêtes individuelles.")

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc())
    finally:
        # En théorie, Pyserini/jnius gère la fermeture de la JVM, pas besoin de fermer le searcher explicitement
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des 8 configurations de recherche (Séquentiel) ---

print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")
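# --- Esquisse optionnelle: évaluation d'un run avec pytrec_eval ---
# Hypothèses: QRELS_DIR contient un fichier de jugements au format TREC
# ("qid 0 docid rel"); le choix du fichier qrels et du run (ici run_file_1,
# baseline_short_bm25) est purement illustratif et à adapter.
import os
import glob
import pytrec_eval

qrels_files = glob.glob(os.path.join(QRELS_DIR, "*"))
if qrels_files:
    qrels_path = qrels_files[0]  # à adapter selon vos fichiers de jugements
    qrels = {}
    with open(qrels_path, 'r', encoding='utf-8') as f_q:
        for line in f_q:
            parts = line.split()
            if len(parts) >= 4:
                qid, docid, rel = parts[0], parts[2], parts[3]
                qrels.setdefault(qid, {})[docid] = int(rel)

    run = {}
    with open(run_file_1, 'r', encoding='utf-8') as f_r:
        for line in f_r:
            qid, _, docid, _rank, score, _tag = line.split()
            run.setdefault(qid, {})[docid] = float(score)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'P_10'})
    results = evaluator.evaluate(run)
    if results:
        mean_map = sum(r['map'] for r in results.values()) / len(results)
        mean_p10 = sum(r['P_10'] for r in results.values()) / len(results)
        print(f"MAP moyen (run 1): {mean_map:.4f} | P@10 moyen: {mean_p10:.4f}")
else:
    print(f"Aucun fichier qrels trouvé dans {QRELS_DIR}, évaluation ignorée.")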


# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Tout-en-un) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Essayer de trouver la ressource pour éviter le re-téléchargement inutile
        if resource == 'punkt':
            nltk.data.find(f'tokenizers/{resource}.zip')
        elif resource == 'omw-1.4':
             nltk.data.find(f'corpora/{resource}.zip')
        else:
            nltk.data.find(f'corpora/{resource}.zip')
        # print(f"  Ressource NLTK '{resource}' déjà présente.")
    except nltk.downloader.DownloadError:
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")
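# --- Esquisse optionnelle: mini-test de parse_topics sur un topic fictif ---
# Hypothèse: les fichiers de topics suivent le format TREC classique
# (<top>, <num> Number:, <title>, <desc> Description:). Exemple illustratif
# écrit dans /tmp uniquement pour vérifier le parseur.
_exemple_topic = """<top>
<num> Number: 999
<title> exemple de requete
<desc> Description:
Ceci est une description fictive pour tester le parseur.
</top>
"""
_tmp_topic_path = "/tmp/topic_exemple.txt"
with open(_tmp_topic_path, 'w', encoding='utf-8') as f_tmp:
    f_tmp.write(_exemple_topic)
print(f"  Test parse_topics (fictif): {parse_topics(_tmp_topic_path)}")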

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")


# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire, Décompresser et Formater les Documents ===
import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Chemins définis dans la cellule précédente (full_setup_code)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex (inchangées)
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

# Ouvrir/créer le fichier JSONL de sortie
try:
    # Utiliser encoding='utf-8' pour l'écriture
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Traiter seulement les fichiers se terminant par .gz ou .Z (typique pour TREC)
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Initialiser content

            try:
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # --- AJOUT : Décompression Gzip ---
                    try:
                        # Décompresser le contenu lu
                        content_bytes = gzip.decompress(compressed_content)
                        # Décoder en texte APRES décompression
                        content = content_bytes.decode('utf-8', errors='ignore')
                    except gzip.BadGzipFile:
                        # print(f"Avertissement: Fichier {member.name} n'est pas un fichier gzip valide, tentative de lecture directe.")
                        # Essayer de décoder directement si ce n'était pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au fichier suivant si la décompression échoue
                    # --- FIN AJOUT ---

                    # Chercher les documents dans le contenu décompressé et décodé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches:
                         # Si aucun <DOC> trouvé, passer au membre suivant
                         continue

                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé.")
     raise
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement: {e_general}")
     traceback.print_exc()
     raise e_general

print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0:
    print(f"  {decompression_errors} erreurs ou avertissements de décompression rencontrés.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait ! Vérifiez les regex ou la structure interne des fichiers décompressés. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")
    elif output_size > 0 and doc_count > 0:
         print("  SUCCÈS: Le fichier de sortie contient des données.")
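# --- Esquisse optionnelle: aperçu de la première ligne du JSONL produit ---
# Simple contrôle visuel du format attendu par Pyserini: {"id": ..., "contents": ...}.
if os.path.exists(JSONL_OUTPUT_PATH) and os.path.getsize(JSONL_OUTPUT_PATH) > 0:
    with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as f_peek:
        premiere_ligne = f_peek.readline().strip()
    print(f"  Première ligne (tronquée à 200 caractères): {premiere_ligne[:200]}")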



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (maintenant non vide)
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction a échoué.")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé malgré un fichier source non vide. Problème potentiel.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Sauvegarde des fichiers générés vers Google Drive ===
import os
import subprocess

# Redéfinir le chemin de base sur Drive (adaptez si nécessaire)
# Assurez-vous que ce chemin pointe vers le dossier où vous voulez sauvegarder,
# par exemple, le dossier Projet_RI
# DRIVE_SAVE_BASE_PATH = "/content/drive/My Drive/Projet_RI" # Exemple
# Ou utiliser le chemin du projet TREC si vous voulez sauvegarder dedans
DRIVE_SAVE_BASE_PATH = DRIVE_PROJECT_PATH # Sauvegarde dans le dossier TREC

# Chemin source dans Colab
SOURCE_DIR = "/content/ap_output"

# Chemin cible sur Google Drive
# Crée un sous-dossier 'colab_output_backup' pour ne pas mélanger
# avec vos fichiers originaux.
TARGET_DIR_ON_DRIVE = os.path.join(DRIVE_SAVE_BASE_PATH, "colab_output_backup")

print(f"Source à copier : {SOURCE_DIR}")
print(f"Cible sur Drive : {TARGET_DIR_ON_DRIVE}")

# Vérifier si le dossier source existe
if os.path.exists(SOURCE_DIR):
    # Créer le dossier cible sur Drive s'il n'existe pas
    os.makedirs(TARGET_DIR_ON_DRIVE, exist_ok=True)
    print("\nCopie des fichiers en cours... (Cela peut prendre quelques minutes)")
    # Utiliser cp -r (récursif) et -v (verbeux)
    copy_cmd = f"cp -r -v '{SOURCE_DIR}/.' '{TARGET_DIR_ON_DRIVE}/'" # Copie le contenu de SOURCE_DIR
    try:
        # Exécuter la copie via subprocess et attendre la fin (peut être long)
        process = subprocess.Popen(copy_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            print("\nCopie terminée avec succès !")
            print(f"Les fichiers de {SOURCE_DIR} ont été copiés dans {TARGET_DIR_ON_DRIVE}")
        else:
            print(f"\nERREUR lors de la copie. Code de retour: {process.returncode}")
            print("STDOUT:", stdout.decode())
            print("STDERR:", stderr.decode())
    except Exception as e:
        print(f"\nERREUR inattendue lors de la copie: {e}")
else:
    print(f"Le dossier source {SOURCE_DIR} n'existe pas, aucune copie effectuée.")
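# --- Esquisse optionnelle: alternative en pur Python avec shutil ---
# Hypothèses: Python >= 3.8 (paramètre dirs_exist_ok) et variables SOURCE_DIR /
# TARGET_DIR_ON_DRIVE définies ci-dessus. Mettre USE_SHUTIL_COPY à True pour
# copier via shutil plutôt que via la commande cp.
import shutil
USE_SHUTIL_COPY = False
if USE_SHUTIL_COPY and os.path.exists(SOURCE_DIR):
    shutil.copytree(SOURCE_DIR, TARGET_DIR_ON_DRIVE, dirs_exist_ok=True)
    print(f"Copie shutil terminée vers {TARGET_DIR_ON_DRIVE}")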



# --- Nouvelle Cellule ---

# === Restauration des fichiers depuis Google Drive ===
import os
import subprocess
import time

# Chemin où les fichiers ont été sauvegardés sur Drive
# (Doit correspondre au TARGET_DIR_ON_DRIVE de la cellule save_output_code)
# Assurez-vous que DRIVE_PROJECT_PATH est défini par la cellule de setup précédente
try:
    DRIVE_PROJECT_PATH
except NameError:
    print("ERREUR: La variable DRIVE_PROJECT_PATH n'est pas définie. Exécutez d'abord la cellule de configuration complète.")
    # Optionnel: Redéfinir ici si nécessaire, mais il vaut mieux exécuter la cellule de setup
    # DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC"
    raise

DRIVE_BACKUP_DIR = os.path.join(DRIVE_PROJECT_PATH, "colab_output_backup")

# Chemin cible dans Colab (où Pyserini s'attend à les trouver)
TARGET_RESTORE_DIR = "/content/ap_output"

print(f"Source sur Drive : {DRIVE_BACKUP_DIR}")
print(f"Cible dans Colab : {TARGET_RESTORE_DIR}")

# Vérifier si le dossier de sauvegarde existe sur Drive
if os.path.exists(DRIVE_BACKUP_DIR):
    # Créer le dossier cible dans Colab s'il n'existe pas
    # (La cellule de setup l'a peut-être déjà créé, mais `exist_ok=True` gère cela)
    os.makedirs(TARGET_RESTORE_DIR, exist_ok=True)

    print("\nRestauration des fichiers en cours... (Cela peut prendre quelques minutes)")
    # Utiliser cp -r (récursif) et -v (verbeux)
    # Copie le contenu de DRIVE_BACKUP_DIR dans TARGET_RESTORE_DIR
    # L'option -T peut être utile si TARGET_RESTORE_DIR existe déjà pour éviter de créer un sous-dossier
    # Mais copier le contenu avec '/.' est généralement plus sûr.
    copy_cmd = f"cp -r -v '{DRIVE_BACKUP_DIR}/.' '{TARGET_RESTORE_DIR}/'"
    try:
        # Exécuter et attendre la fin
        process = subprocess.run(copy_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600) # Timeout 10 minutes
        # Afficher stdout/stderr peut être très long, afficher seulement si erreur?
        # print("STDOUT:", process.stdout)
        # print("STDERR:", process.stderr)
        print("\nRestauration terminée avec succès !")
        print(f"Les fichiers de {DRIVE_BACKUP_DIR} ont été copiés dans {TARGET_RESTORE_DIR}")
        # Vérifier le contenu restauré
        print("\nContenu du dossier restauré (partiel):")
        !ls -lR {TARGET_RESTORE_DIR} | head -n 20 # Afficher une partie du contenu
    except subprocess.CalledProcessError as e:
        print(f"\nERREUR lors de la restauration. Code de retour: {e.returncode}")
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)
        print("\nVérifiez que le dossier de sauvegarde existe et contient les bons fichiers/dossiers (corpus, indexes/baseline).")
        raise e
    except subprocess.TimeoutExpired as e:
        print(f"\nERREUR: La restauration a dépassé le délai d'attente.")
        raise e
    except Exception as e:
        print(f"\nERREUR inattendue lors de la restauration: {e}")
        raise e
else:
    print(f"ERREUR: Le dossier de sauvegarde {DRIVE_BACKUP_DIR} n'existe pas sur Google Drive.")
    print("Impossible de restaurer les fichiers. Vous devrez relancer les étapes d'extraction et d'indexation baseline.")
    # Optionnel: lever une exception pour arrêter
    # raise FileNotFoundError(f"Dossier de sauvegarde non trouvé: {DRIVE_BACKUP_DIR}")
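# --- Esquisse optionnelle: contrôle rapide des dossiers restaurés ---
# Vérifie simplement la présence des sous-dossiers attendus par la suite du
# notebook (noms supposés identiques à ceux créés lors de la sauvegarde).
for sous_dossier in ["corpus", "indexes/baseline", "indexes/preprocessed", "runs"]:
    chemin = os.path.join(TARGET_RESTORE_DIR, sous_dossier)
    statut = "OK" if os.path.exists(chemin) else "MANQUANT"
    print(f"  {statut}: {chemin}")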



# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Pour Reprendre) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Essayer de trouver la ressource pour éviter le re-téléchargement inutile
        if resource == 'punkt':
            nltk.data.find(f'tokenizers/{resource}.zip')
        elif resource == 'omw-1.4':
             nltk.data.find(f'corpora/{resource}.zip')
        else:
            nltk.data.find(f'corpora/{resource}.zip')
        # print(f"  Ressource NLTK '{resource}' déjà présente.")
    except nltk.downloader.DownloadError:
        print(f"  Téléchargement de la ressource NLTK '{resource}'...")
        nltk.download(resource, quiet=True)
print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Correction NLTK) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
# S'assurer que nltk est importé
import nltk
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4']
for resource in nltk_resources:
    try:
        # Déterminer le chemin de recherche correct pour nltk.data.find
        if resource == 'punkt':
            resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4':
             resource_path = f'corpora/{resource}.zip' # Open Multilingual Wordnet
        elif resource == 'wordnet':
             resource_path = f'corpora/{resource}.zip'
        else: # stopwords, etc.
            resource_path = f'corpora/{resource}.zip'

        # Essayer de trouver la ressource
        nltk.data.find(resource_path)
        # print(f"  Ressource NLTK '{resource}' déjà présente.")

    # --- CORRECTION ICI: Utiliser except LookupError ---
    except LookupError:
    # --------------------------------------------------
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            # Capturer les erreurs potentielles de téléchargement (réseau, etc.)
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")
            # Optionnel: arrêter si une ressource critique manque
            # if resource in ['punkt', 'stopwords', 'wordnet']: raise

print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
# (Déjà fait plus haut, mais redondance sans danger)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    tokens = word_tokenize(text.lower())
    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
print(f"  {len(all_topics)} topics parsés.")
print(f"  {len(queries_short)} requêtes courtes créées.")

# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule de Configuration Complète (Correction NLTK punkt_tab) ===
# Réunit toutes les étapes de setup nécessaires

import os
import sys
import subprocess
import time
import glob
import re
import json
import nltk # Importer nltk ici pour la partie NLTK
from tqdm.notebook import tqdm # Assurer l'import pour les fonctions
import traceback # Pour afficher les erreurs

print("--- Début de la Configuration Complète ---")
print("Cela peut prendre plusieurs minutes...")

# --- Partie 1: Installation Java 21 et Configuration ---
print("\n[1/9] Installation de OpenJDK 21...")
install_java_cmd = "apt-get update -qq > /dev/null && apt-get install -y openjdk-21-jdk-headless -qq > /dev/null"
try:
    subprocess.run(install_java_cmd, shell=True, check=True, timeout=180)
    print("OpenJDK 21 installé.")
except Exception as e:
    print(f"ERREUR lors de l'installation de Java 21: {e}")
    raise # Arrêter si Java ne s'installe pas

print("\n[2/9] Configuration de Java 21 comme défaut via update-alternatives...")
java_path_21 = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path_21):
    try:
        subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path_21} 1", shell=True, check=True)
        subprocess.run(f"update-alternatives --set java {java_path_21}", shell=True, check=True)
        print("update-alternatives configuré pour java.")
    except Exception as e:
        print(f"ERREUR lors de la configuration de update-alternatives: {e}")
        # Continuer mais avertir
else:
    print(f"ATTENTION: Chemin Java 21 non trouvé à {java_path_21}. update-alternatives non configuré.")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
if not os.path.exists(os.environ["JAVA_HOME"]):
     print(f"ATTENTION: Le chemin JAVA_HOME '{os.environ['JAVA_HOME']}' n'existe pas.")

# --- Partie 2: Installation des outils de build C++ ---
print("\n[3/9] Installation des outils de build (build-essential, cmake)...")
install_build_cmd = "apt-get update -qq > /dev/null && apt-get install -y build-essential cmake -qq > /dev/null"
try:
    subprocess.run(install_build_cmd, shell=True, check=True, timeout=180)
    print("Outils de build installés.")
except Exception as e_build:
    print(f"ERREUR lors de l'installation des outils de build: {e_build}")
    # Continuer mais avertir

# --- Partie 3: Installation de pybind11 ---
print("\n[4/9] Installation de pybind11...")
install_pybind_cmd = f"{sys.executable} -m pip install pybind11 -q" # -q peut être enlevé si ça échoue
try:
    subprocess.run(install_pybind_cmd, shell=True, check=True, capture_output=True, text=True, timeout=60)
    print("pybind11 installé avec succès.")
except Exception as e_pybind:
    print(f"ERREUR lors de l'installation de pybind11: {e_pybind}")
    # Continuer mais avertir

# --- Partie 4: Installation des Paquets Python Principaux ---
print("\n[5/9] Installation de la DERNIÈRE Pyserini, NLTK, Pytrec_eval...")
# Installer sans -q pour voir les erreurs si ça se reproduit
install_pip_cmd = f"{sys.executable} -m pip install pyserini nltk pytrec_eval"
try:
    result_pip = subprocess.run(install_pip_cmd, shell=True, check=True, capture_output=True, text=True, timeout=600)
    print("Paquets Python principaux installés.")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: La commande pip principale a échoué avec le code {e.returncode}")
    print("Sortie STDOUT de Pip:\n", e.stdout)
    print("Sortie STDERR de Pip:\n", e.stderr)
    raise e # Arrêter si l'installation de pyserini échoue
except subprocess.TimeoutExpired as e:
    print("\nERREUR: La commande pip principale a dépassé le délai d'attente.")
    print("Sortie STDOUT de Pip (partielle):\n", e.stdout)
    print("Sortie STDERR de Pip (partielle):\n", e.stderr)
    raise e
except Exception as e_pip:
    print(f"\nERREUR inattendue lors de l'installation pip: {e_pip}")
    raise e_pip

# --- Partie 5: Téléchargement Ressources NLTK ---
print("\n[6/9] Téléchargement/Vérification des ressources NLTK...")
# S'assurer que nltk est importé
import nltk
# --- CORRECTION ICI: Ajout de 'punkt_tab' ---
nltk_resources = ['wordnet', 'stopwords', 'punkt', 'omw-1.4', 'punkt_tab']
# ---------------------------------------------
for resource in nltk_resources:
    try:
        # Déterminer le chemin de recherche correct pour nltk.data.find
        if resource == 'punkt' or resource == 'punkt_tab': # punkt_tab est aussi dans tokenizers
            resource_path = f'tokenizers/{resource}.zip'
        elif resource == 'omw-1.4':
             resource_path = f'corpora/{resource}.zip' # Open Multilingual Wordnet
        elif resource == 'wordnet':
             resource_path = f'corpora/{resource}.zip'
        else: # stopwords, etc.
            resource_path = f'corpora/{resource}.zip'

        # Essayer de trouver la ressource
        nltk.data.find(resource_path)
        # print(f"  Ressource NLTK '{resource}' déjà présente.")

    except LookupError:
        print(f"  Ressource NLTK '{resource}' non trouvée. Téléchargement...")
        try:
            nltk.download(resource, quiet=True)
            print(f"  Ressource '{resource}' téléchargée.")
        except Exception as e_download:
            # Capturer les erreurs potentielles de téléchargement (réseau, etc.)
            print(f"  ERREUR lors du téléchargement de '{resource}': {e_download}")
            # Optionnel: arrêter si une ressource critique manque
            # if resource in ['punkt', 'stopwords', 'wordnet']: raise

print("Ressources NLTK prêtes.")

# --- Partie 6: Définition des Chemins ---
print("\n[7/9] Définition des chemins...")

# !!! MODIFIEZ CETTE LIGNE AVEC LE CHEMIN CORRECT VERS VOTRE DOSSIER TREC !!!
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC" # <--- VÉRIFIEZ ET CORRIGEZ CE CHEMIN !

# --- Vérification et définition des autres chemins ---
if 'google.colab' in sys.modules:
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive/My Drive'):
             print("  Montage de Google Drive...")
             drive.mount('/content/drive', force_remount=True)
        else:
             print("  Google Drive déjà monté.")
    except ModuleNotFoundError:
         print("ATTENTION: Google Colab non détecté ou erreur d'import.")
    except Exception as e_mount:
         print(f"ATTENTION: Erreur lors du montage de Drive: {e_mount}")

if not os.path.exists(DRIVE_PROJECT_PATH):
     raise FileNotFoundError(f"Le chemin Drive '{DRIVE_PROJECT_PATH}' n'existe pas. Vérifiez le chemin exact et le nom des dossiers.")

print(f"  Chemin du projet Drive utilisé: {DRIVE_PROJECT_PATH}")

AP_TAR_FILENAME = "AP.tar" # Nom du fichier archive
AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, AP_TAR_FILENAME)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "Topics-requetes")
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "jugements de pertinence")
OUTPUT_DIR = "/content/ap_output"
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")
# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
print(f"  Chemin Fichier AP CIBLE: {AP_TAR_PATH}")
print(f"  Chemin Qrels: {QRELS_DIR}")
print(f"  Chemin Runs: {RUN_DIR}")

# --- Partie 7: Définition Fonction Prétraitement ---
print("\n[8/9] Définition de la fonction preprocess_text...")
# S'assurer que nltk est importé avant d'utiliser ses modules
# (Déjà fait plus haut, mais redondance sans danger)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
# Utiliser des noms de variables différents pour éviter conflits potentiels
stop_words_set_global = set(stopwords.words('english'))
lemmatizer_obj_global = WordNetLemmatizer()
def preprocess_text(text):
    if not isinstance(text, str): return ""
    # Utiliser les objets globaux définis ici
    # Mettre la tokenisation dans un try-except spécifique pour voir si c'est elle qui échoue
    try:
        tokens = word_tokenize(text.lower())
    except LookupError as e_tok:
         # Essayer de télécharger la ressource manquante si c'est une LookupError NLTK
         if 'Resource' in str(e_tok) and 'not found' in str(e_tok):
              # Le message d'erreur NLTK peut contenir des codes couleur ANSI; on les retire avant d'extraire le nom de la ressource
              resource_name = re.sub(r'\x1b\[[0-9;]*m', '', str(e_tok)).split('Resource ')[1].split(' ')[0].strip()
              print(f"--- Tokenizer a besoin de '{resource_name}', tentative de téléchargement ---")
              try:
                  nltk.download(resource_name, quiet=True)
                  print(f"--- Ressource '{resource_name}' téléchargée, nouvelle tentative de tokenisation ---")
                  tokens = word_tokenize(text.lower()) # Retenter après téléchargement
              except Exception as e_dl_tok:
                  print(f"--- Échec du téléchargement de '{resource_name}': {e_dl_tok} ---")
                  raise e_tok # Relancer l'erreur originale si le téléchargement échoue
         else:
              raise e_tok # Relancer si ce n'est pas une ressource manquante connue
    except Exception as e_tok_other:
         print(f"Erreur inattendue dans word_tokenize: {e_tok_other}")
         raise e_tok_other

    filtered_tokens = [lemmatizer_obj_global.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words_set_global]
    return ' '.join(filtered_tokens)
print("  Fonction preprocess_text définie.")

# --- Partie 8: Parsing des Topics ---
print("\n[9/9] Parsing des topics...")
# S'assurer que re et glob sont importés
import re
import glob
def parse_topics(file_path):
    topics = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)
                num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match: continue
                topic_id = num_match.group(1).strip()
                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""
                desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""
                if topic_id and title:
                     topics[topic_id] = {'title': title, 'desc': desc}
    except FileNotFoundError:
        print(f"  ATTENTION: Fichier topic non trouvé: {file_path}")
    except Exception as e_topic:
        print(f"  ATTENTION: Erreur lors du parsing de {file_path}: {e_topic}")
    return topics

if not os.path.exists(TOPICS_DIR):
     print(f"ATTENTION: Le dossier des topics '{TOPICS_DIR}' n'existe pas.")
     topic_files = []
else:
    topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))

all_topics = {}
if not topic_files:
     print(f"  ATTENTION: Aucun fichier topic trouvé dans {TOPICS_DIR}")
else:
    for tf in topic_files:
        all_topics.update(parse_topics(tf))

# Définir les dictionnaires même s'ils sont vides pour éviter NameError plus tard
# Mettre la création des dictionnaires prétraités dans un try-except au cas où preprocess_text échouerait encore
try:
    queries_short = {qid: data['title'] for qid, data in all_topics.items()}
    queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()}
    print(f"  {len(all_topics)} topics parsés.")
    print(f"  {len(queries_short)} requêtes courtes brutes créées.")
    print(f"  Prétraitement des requêtes...")
    queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
    queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}
    print(f"  Prétraitement des requêtes terminé.")
except Exception as e_preproc_queries:
     print(f"\nERREUR lors du prétraitement des requêtes: {e_preproc_queries}")
     print("Les dictionnaires prétraités pourraient être incomplets ou vides.")
     # Créer des dictionnaires vides pour éviter NameError plus tard
     queries_short_preprocessed = {}
     queries_long_preprocessed = {}


# --- Vérification Finale Java ---
print("\n--- Vérification Finale de la Version Java Active ---")
java_check_cmd = "java -version"
try:
    result = subprocess.run(java_check_cmd, shell=True, check=True, capture_output=True, text=True, timeout=10)
    print("Sortie STDERR (contient souvent la version OpenJDK):\n", result.stderr)
    if "21." not in result.stderr and "21." not in result.stdout:
         print("\nATTENTION: Java 21 ne semble PAS être la version active !")
    else:
         print("\nConfirmation: Java 21 semble être la version active.")
except Exception as e:
    print(f"\nERREUR lors de la vérification Java: {e}")

# --- Vérification Finale Pyserini ---
print("\n--- Vérification Finale de la Version Pyserini Installée ---")
try:
    result_pyserini = subprocess.run(f"{sys.executable} -m pip show pyserini", shell=True, check=True, capture_output=True, text=True, timeout=30)
    print(result_pyserini.stdout)
except Exception as e_pyserini_check:
    print(f"ERREUR lors de la vérification de Pyserini: {e_pyserini_check}")

print("\n--- Configuration Complète Terminée ---")
# Ajouter un délai pour s'assurer que tout est stable avant la prochaine cellule
print("\nPause de 5 secondes...")
time.sleep(5)
print("Prêt pour la suite.")



# --- Nouvelle Cellule ---

# === Cellule 0.4 (Modifiée): Extraire, Décompresser et Formater les Documents ===
import tarfile
import re
import json
import gzip # Importer le module gzip
from tqdm.notebook import tqdm
import os
import traceback

# Chemins définis dans la cellule précédente (full_setup_code_punkt_tab_fixed)
# AP_TAR_PATH devrait pointer vers ".../AP.tar"
# CORPUS_DIR devrait être défini

JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction, Décompression et Formatage depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Vérifier si le fichier AP.tar existe
if not os.path.exists(AP_TAR_PATH):
    raise FileNotFoundError(f"Le fichier d'archive {AP_TAR_PATH} n'a pas été trouvé.")
else:
    tar_size = os.path.getsize(AP_TAR_PATH)
    print(f"  Taille du fichier {AP_TAR_PATH}: {tar_size} octets.")

# Regex (inchangées)
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

doc_count = 0
file_read_count = 0
skipped_members = 0
decompression_errors = 0

# Ouvrir/créer le fichier JSONL de sortie
try:
    # Utiliser encoding='utf-8' pour l'écriture
    with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
        members = tar.getmembers()
        print(f"\n{len(members)} membres trouvés dans l'archive TAR.")

        for member in tqdm(members, desc="Traitement des fichiers TAR"):
            # Traiter seulement les fichiers se terminant par .gz ou .Z (typique pour TREC)
            # Ignorer les dossiers ou les fichiers non réguliers
            if not member.isfile() or not member.name.lower().endswith(('.gz', '.z')):
                skipped_members += 1
                continue

            file_read_count += 1
            content = "" # Initialiser content

            try:
                f = tar.extractfile(member)
                if f:
                    compressed_content = f.read()
                    f.close()

                    # --- AJOUT : Décompression Gzip ---
                    try:
                        # Décompresser le contenu lu
                        content_bytes = gzip.decompress(compressed_content)
                        # Décoder en texte APRES décompression
                        content = content_bytes.decode('utf-8', errors='ignore')
                    except gzip.BadGzipFile:
                        # print(f"Avertissement: Fichier {member.name} n'est pas un fichier gzip valide, tentative de lecture directe.")
                        # Essayer de décoder directement si ce n'était pas du gzip
                        content = compressed_content.decode('utf-8', errors='ignore')
                        decompression_errors += 1
                    except Exception as e_gzip:
                         print(f"\nErreur de décompression pour {member.name}: {e_gzip}")
                         decompression_errors += 1
                         continue # Passer au fichier suivant si la décompression échoue
                    # --- FIN AJOUT ---

                    # Chercher les documents dans le contenu décompressé et décodé
                    doc_matches = doc_pattern.findall(content)
                    if not doc_matches:
                         # Si aucun <DOC> trouvé, passer au membre suivant
                         continue

                    for doc_content in doc_matches:
                        docno_match = docno_pattern.search(doc_content)
                        if not docno_match:
                            continue
                        doc_id = docno_match.group(1).strip()

                        text_match = text_pattern.search(doc_content)
                        if text_match:
                           doc_text = text_match.group(1).strip()
                           doc_text = ' '.join(doc_text.split())
                        else:
                            doc_text = ""

                        try:
                            json_line = json.dumps({"id": str(doc_id), "contents": str(doc_text)})
                            outfile.write(json_line + '\n')
                            doc_count += 1
                        except Exception as e_write:
                            print(f"Erreur lors de l'écriture JSON pour doc_id {doc_id}: {e_write}")

            except KeyError as e_key:
                print(f"\nAvertissement: Membre '{member.name}' inaccessible (KeyError): {e_key}")
                skipped_members += 1
            except EOFError:
                 print(f"\nAvertissement: Fin de fichier inattendue lors de la lecture de {member.name}.")
                 skipped_members += 1
            except Exception as e_extract:
                print(f"\nErreur inattendue lors de l'extraction/lecture du membre {member.name}: {e_extract}")
                skipped_members += 1

except tarfile.ReadError as e_tar:
    print(f"\nERREUR: Impossible de lire le fichier TAR {AP_TAR_PATH}. Erreur: {e_tar}")
    raise e_tar
except FileNotFoundError:
     print(f"\nERREUR: Le fichier TAR {AP_TAR_PATH} n'a pas été trouvé.")
     raise  # relancer l'exception FileNotFoundError d'origine
except Exception as e_general:
     print(f"\nERREUR générale lors du traitement: {e_general}")
     traceback.print_exc()
     raise e_general

print(f"\n--- Fin de l'Extraction et Décompression ---")
print(f"  {file_read_count} fichiers (.gz/.Z) lus depuis l'archive.")
print(f"  {skipped_members} membres ignorés.")
if decompression_errors > 0:
    print(f"  {decompression_errors} erreurs ou avertissements de décompression rencontrés.")
print(f"  {doc_count} documents au total ont été formatés et écrits dans {JSONL_OUTPUT_PATH}")

if doc_count == 0 and file_read_count > 0:
     print("\n*** PROBLEME MAJEUR: Aucun document n'a été extrait ! Vérifiez les regex ou la structure interne des fichiers décompressés. ***")
elif doc_count < 100000 and file_read_count > 0:
     print("\n  ATTENTION: Le nombre de documents extraits semble faible.")

# Vérifier la taille du fichier de sortie
if os.path.exists(JSONL_OUTPUT_PATH):
    output_size = os.path.getsize(JSONL_OUTPUT_PATH)
    print(f"  Taille finale de {JSONL_OUTPUT_PATH}: {output_size} octets.")
    if output_size == 0 and doc_count == 0:
        print("  CONFIRMATION: Le fichier de sortie est vide.")
    elif output_size > 0 and doc_count > 0:
         print("  SUCCÈS: Le fichier de sortie contient des données.")



# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment dans la cellule de configuration complète
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (maintenant non vide)
# INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline") # Dossier cible
# CORPUS_DIR contient le fichier JSONL

# S'assurer que les variables sont définies (au cas où)
try:
    CORPUS_DIR
    INDEX_DIR_BASELINE
except NameError:
    print("ERREUR: Les variables CORPUS_DIR ou INDEX_DIR_BASELINE ne sont pas définies. Ré-exécutez la cellule de configuration.")
    # Optionnel: redéfinir ici, mais moins propre
    # OUTPUT_DIR = "/content/ap_output"
    # CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
    # INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
    raise

print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
# Pyserini utilise le dossier CORPUS_DIR comme entrée pour JsonCollection
print(f"Dossier source contenant ap_docs.jsonl: {CORPUS_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_BASELINE}")

# Vérifier si le fichier source existe et n'est pas vide
jsonl_source_path = os.path.join(CORPUS_DIR, "ap_docs.jsonl")
if not os.path.exists(jsonl_source_path) or os.path.getsize(jsonl_source_path) == 0:
     raise FileNotFoundError(f"Le fichier source {jsonl_source_path} est manquant ou vide. L'étape d'extraction ('extract_code_tar_gzip_fixed') a peut-être échoué ou n'a pas été exécutée.")

# Commande Pyserini pour l'indexation
# Utilise la dernière version de Pyserini installée
index_cmd_baseline = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_DIR,
    "--index", INDEX_DIR_BASELINE,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté (ex: 2 ou 8 selon les ressources Colab)
    "--storePositions", "--storeDocvectors", "--storeRaw" # Options utiles pour certaines techniques avancées
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_baseline)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion des erreurs/sorties
    # Augmentation possible du timeout si l'indexation est très longue
    result = subprocess.run(index_cmd_baseline, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé malgré un fichier source non vide. Problème potentiel.")
    else:
         print(f"\nIndexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Baseline a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e # Arrêter si l'indexation échoue
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Baseline a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Baseline: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_BASELINE}...")
if os.path.exists(INDEX_DIR_BASELINE):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_BASELINE}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
import json
from tqdm.notebook import tqdm
import os
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl") # Fichier source (non vide)
# CORPUS_DIR

# S'assurer que les variables sont définies
try:
    CORPUS_DIR
    JSONL_OUTPUT_PATH
except NameError:
    print("ERREUR: Les variables CORPUS_DIR ou JSONL_OUTPUT_PATH ne sont pas définies. Ré-exécutez la cellule de configuration.")
    raise

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées depuis {JSONL_OUTPUT_PATH} vers {JSONL_PREPROC_PATH}...")

# S'assurer que la fonction preprocess_text est définie (normalement fait dans la cellule de setup)
if 'preprocess_text' not in globals():
    print("Erreur: La fonction 'preprocess_text' n'est pas définie. Ré-exécutez la cellule de configuration.")
    raise NameError("preprocess_text non définie")
else:
    doc_count_preproc = 0
    error_count = 0
    # Lire le fichier JSONL original et écrire le fichier prétraité
    try:
        # Utiliser utf-8 pour lire et écrire
        with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as infile, \
             open(JSONL_PREPROC_PATH, 'w', encoding='utf-8') as outfile:

            # Itérer sur le fichier d'entrée
            # Utiliser tqdm pour la barre de progression
            for line in tqdm(infile, desc="Prétraitement des documents"):
                try:
                    data = json.loads(line)
                    # Utiliser .get pour la robustesse si 'id' ou 'contents' manque
                    doc_id = data.get('id', None)
                    original_contents = data.get('contents', '')

                    if doc_id is None:
                        error_count += 1
                        continue

                    # Appliquer le prétraitement
                    preprocessed_contents = preprocess_text(original_contents)

                    # Écrire la nouvelle ligne JSONL
                    json_line = json.dumps({"id": str(doc_id), "contents": str(preprocessed_contents)})
                    outfile.write(json_line + '\n')
                    doc_count_preproc += 1

                except json.JSONDecodeError:
                    # print(f"Avertissement: Erreur de décodage JSON sur une ligne, ignorée.")
                    error_count += 1
                except Exception as e_line:
                    print(f"\nErreur inattendue lors du prétraitement d'une ligne (id={data.get('id', 'inconnu')}): {e_line}")
                    error_count += 1

        print(f"\nTerminé.")
        print(f"  {doc_count_preproc} documents prétraités et écrits dans {JSONL_PREPROC_PATH}")
        if error_count > 0:
             print(f"  {error_count} lignes ignorées à cause d'erreurs.")

        # Vérifier la taille du fichier de sortie
        if os.path.exists(JSONL_PREPROC_PATH):
            output_size = os.path.getsize(JSONL_PREPROC_PATH)
            print(f"  Taille finale de {JSONL_PREPROC_PATH}: {output_size} octets.")
            if output_size == 0 and doc_count_preproc > 0:
                 print("  ATTENTION: 0 octet écrit malgré le traitement de documents. Problème ?")
        else:
            print(f"  ATTENTION: Le fichier de sortie {JSONL_PREPROC_PATH} n'a pas été créé.")


    except FileNotFoundError:
        print(f"ERREUR: Le fichier d'entrée {JSONL_OUTPUT_PATH} n'a pas été trouvé.")
        raise
    except Exception as e_main:
        print(f"ERREUR générale lors de la préparation des données prétraitées: {e_main}")
        traceback.print_exc()
        raise
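
To see what the preprocessing actually changes, one can compare the same document before and after. The small check below assumes both JSONL files were written in the same document order by the cells above:

# Illustrative: compare the first document in the raw and preprocessed corpora.
import json

with open(JSONL_OUTPUT_PATH, 'r', encoding='utf-8') as f_raw, \
     open(JSONL_PREPROC_PATH, 'r', encoding='utf-8') as f_pre:
    raw_doc = json.loads(f_raw.readline())
    pre_doc = json.loads(f_pre.readline())

assert raw_doc["id"] == pre_doc["id"], "Both files are expected to list documents in the same order."
print("id:            ", raw_doc["id"])
print("raw tokens:    ", len(raw_doc["contents"].split()))
print("preproc tokens:", len(pre_doc["contents"].split()))  # usually noticeably smaller (stopwords removed)
print("preproc sample:", pre_doc["contents"][:150], "...")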



# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
import os # Assurer que os est importé
import subprocess # Pour exécuter la commande pyserini
import traceback # Pour afficher les erreurs détaillées

# Chemins définis précédemment
# JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Fichier source
# INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed") # Dossier cible pour l'index
# CORPUS_DIR contient le fichier JSONL prétraité

# S'assurer que les variables sont définies
try:
    CORPUS_DIR
    INDEX_DIR_PREPROC
except NameError:
    print("ERREUR: Les variables CORPUS_DIR ou INDEX_DIR_PREPROC ne sont pas définies. Ré-exécutez la cellule de configuration.")
    # Optionnel: redéfinir ici, mais moins propre
    # OUTPUT_DIR = "/content/ap_output"
    # CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
    # INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
    raise

JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl") # Chemin complet pour vérification

print(f"Début de l'indexation avec Prétraitement...")
# Note: JsonCollection indexe TOUS les fichiers .jsonl du dossier d'entrée.
# Si CORPUS_DIR était passé tel quel, ap_docs.jsonl (non prétraité) serait indexé
# une seconde fois; on isole donc le fichier prétraité dans un sous-dossier dédié.
import shutil
CORPUS_PREPROC_DIR = os.path.join(os.path.dirname(CORPUS_DIR), "corpus_preprocessed")
os.makedirs(CORPUS_PREPROC_DIR, exist_ok=True)
print(f"Fichier JSONL prétraité attendu: {JSONL_PREPROC_PATH}")
print(f"Dossier source isolé pour l'indexation: {CORPUS_PREPROC_DIR}")
print(f"Répertoire de l'index cible: {INDEX_DIR_PREPROC}")

# Vérifier si le fichier prétraité existe et n'est pas vide
if not os.path.exists(JSONL_PREPROC_PATH) or os.path.getsize(JSONL_PREPROC_PATH) == 0:
    raise FileNotFoundError(f"Le fichier de données prétraitées {JSONL_PREPROC_PATH} est manquant ou vide. Assurez-vous que l'étape précédente (1.3) s'est bien terminée.")

# Copier le fichier prétraité dans le dossier dédié qui servira d'entrée à Pyserini
shutil.copy(JSONL_PREPROC_PATH, os.path.join(CORPUS_PREPROC_DIR, "ap_docs_preprocessed.jsonl"))

# Commande Pyserini pour l'indexation prétraitée
index_cmd_preproc = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", CORPUS_PREPROC_DIR, # Dossier ne contenant que le jsonl prétraité
    "--index", INDEX_DIR_PREPROC,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4", # Peut être ajusté
    "--storePositions", "--storeDocvectors", "--storeRaw",
    "--pretokenized" # Important: Indique que le texte est déjà tokenisé/traité
]

# Exécuter la commande
print(f"Exécution de la commande: {' '.join(index_cmd_preproc)}")
try:
    # Utiliser subprocess.run pour une meilleure gestion
    result = subprocess.run(index_cmd_preproc, check=True, capture_output=True, text=True, timeout=1800) # Timeout 30 minutes
    print("Sortie STDOUT:\n", result.stdout)
    print("Sortie STDERR:\n", result.stderr)
    # Vérifier si la sortie indique un nombre non nul de documents indexés
    if "Total 0 documents indexed" in result.stdout:
         print("\nATTENTION: Pyserini indique que 0 document a été indexé. Problème potentiel avec l'indexation prétraitée.")
    else:
        print(f"\nIndexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")
except subprocess.CalledProcessError as e:
    print(f"\nERREUR: L'indexation Prétraitée a échoué avec le code {e.returncode}")
    print("Sortie STDOUT:\n", e.stdout)
    print("Sortie STDERR:\n", e.stderr)
    raise e
except subprocess.TimeoutExpired as e:
    print(f"\nERREUR: L'indexation Prétraitée a dépassé le délai d'attente.")
    print("Sortie STDOUT (partielle):\n", e.stdout)
    print("Sortie STDERR (partielle):\n", e.stderr)
    raise e
except Exception as e:
    print(f"\nERREUR inattendue pendant l'indexation Prétraitée: {e}")
    traceback.print_exc()
    raise e

# Vérification finale de l'index (taille)
print(f"\nVérification de la taille de l'index créé dans {INDEX_DIR_PREPROC}...")
if os.path.exists(INDEX_DIR_PREPROC):
    # Commande pour obtenir la taille totale du dossier
    du_cmd = f"du -sh '{INDEX_DIR_PREPROC}'"
    try:
        result_du = subprocess.run(du_cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"  Taille de l'index: {result_du.stdout.split()[0]}")
    except Exception as e_du:
        print(f"  Impossible de vérifier la taille de l'index: {e_du}")
else:
    print("  ATTENTION: Le dossier de l'index n'a pas été créé.")



# --- Nouvelle Cellule ---

# === Cellule 3.1: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Utilise la dernière Pyserini et Java 21 (devraient être actifs)
# S'assurer que les variables d'index et de requêtes sont définies

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées
import os # Assurer que os est importé
from jnius import autoclass, JavaException # Importer pour TF-IDF

# Essayer de définir K_RESULTS si ce n'est pas déjà fait
try:
    K_RESULTS
except NameError:
    print("Définition de K_RESULTS (nombre de résultats) à 1000...")
    K_RESULTS = 1000

# --- Configuration des modèles de similarité ---
# Charger la classe Java pour TF-IDF (ClassicSimilarity)
# Mettre dans un try-except au cas où l'import échouerait (peu probable maintenant)
try:
    ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
    print("Classe ClassicSimilarity (pour TF-IDF) chargée avec succès.")
except JavaException as e_load_class:
    print(f"ERREUR Java lors du chargement de ClassicSimilarity: {e_load_class}")
    print("Les recherches TF-IDF échoueront probablement.")
    ClassicSimilarity = None # Mettre à None pour pouvoir vérifier plus tard
except Exception as e_load_gen:
     print(f"ERREUR inattendue lors du chargement de ClassicSimilarity: {e_load_gen}")
     ClassicSimilarity = None

# Vérifier que les variables nécessaires existent
try:
    INDEX_DIR_BASELINE
    INDEX_DIR_PREPROC
    RUN_DIR
    queries_short
    queries_long
    queries_short_preprocessed
    queries_long_preprocessed
    preprocess_text # Vérifier aussi la fonction
except NameError as e_missing_var:
    print(f"ERREUR: Variable essentielle manquante ({e_missing_var}). L'environnement a peut-être été perdu. Ré-exécutez la cellule de configuration complète.")
    raise e_missing_var


def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25 or baseline_short_tfidf
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        # Assurer que LuceneSearcher est importé
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None:
                 print("ERREUR: Classe ClassicSimilarity non chargée. Impossible de configurer TF-IDF.")
                 print(f"--- ABANDON du run {run_tag} ---")
                 return # Ne pas continuer si la classe n'a pas pu être chargée

            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause de l'erreur de configuration TF-IDF ---")
                 return
            except Exception as e_other:
                 print(f"ERREUR Inattendue lors de la configuration de ClassicSimilarity: {e_other}")
                 print(traceback.format_exc())
                 print(f"--- ABANDON du run {run_tag} à cause d'une erreur TF-IDF ---")
                 return
        else:
            print(f"Modèle '{model}' non reconnu, utilisation de BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                # S'assurer que preprocess_text est défini
                if 'preprocess_text' not in globals():
                     raise NameError("La fonction preprocess_text n'est pas définie.")

                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                # Vérifier si la requête traitée est vide
                if not search_text.strip():
                     # print(f"  Avertissement: Requête QID {query_id} est vide après traitement, ignorée.")
                     continue # Ignorer les requêtes vides

                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    # S'assurer que doc_id n'est pas None (peut arriver dans de rares cas)
                    if doc_id is None:
                        # print(f"  Avertissement: Doc ID est None pour QID {query_id} au rang {rank}, ignoré.")
                        continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                # Compter les erreurs par requête mais continuer
                query_errors += 1
                if query_errors < 10: # Limiter l'affichage des erreurs par requête
                     print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                elif query_errors == 10:
                     print("\nPlusieurs erreurs de recherche pour ce run, messages suivants masqués...")


        # Écrire les résultats dans le fichier de run TREC
        if all_results_list:
             # Utiliser encoding='utf-8' pour l'écriture
             with open(output_run_file, 'w', encoding='utf-8') as f_out:
                f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes de résultats écrites.")
        else:
            print("\n  Avertissement: Aucun résultat généré pour ce run.")

        if query_errors > 0:
            print(f"  Avertissement: {query_errors} erreurs rencontrées lors de la recherche sur les requêtes individuelles.")

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")

    except Exception as e_main:
        # Erreur pendant l'initialisation du searcher ou configuration BM25
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc())
    finally:
        # En théorie, Pyserini/jnius gère la fermeture de la JVM, pas besoin de fermer le searcher explicitement
        if searcher:
             print(f"  Nettoyage implicite des ressources pour {run_tag}.")
             pass


# --- Exécution des 8 configurations de recherche (Séquentiel) ---

print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")


# --- Nouvelle Cellule ---

# === Cellule 7: Exécuter la Recherche Améliorée (RM3) ===
# Applique RM3 sur la meilleure configuration de base identifiée à l'étape 6.
# !! N'OUBLIEZ PAS DE CONFIGURER LES VARIABLES BEST_... CI-DESSOUS !!

from pyserini.search.lucene import LuceneSearcher
from jnius import autoclass, JavaException
from tqdm.notebook import tqdm
import time
import traceback
import os

# Recharger ClassicSimilarity au cas où
try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
except Exception: ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; EVAL_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise

# --- À CONFIGURER selon vos meilleurs résultats de l'Étape 6 ---
# !! MODIFIEZ CECI EN FONCTION DE VOS RÉSULTATS D'ÉVALUATION !!
print("--- Configuration RM3 ---")
print("Veuillez éditer les variables BEST_... ci-dessous en fonction de vos meilleurs résultats MAP de l'étape précédente.")
# Exemple: si preproc + long + bm25 était le meilleur
BEST_INDEX_PATH = INDEX_DIR_PREPROC           # Ex: INDEX_DIR_BASELINE ou INDEX_DIR_PREPROC
BEST_QUERIES = queries_long_preprocessed      # Ex: queries_short, queries_long, ..._preprocessed
BEST_MODEL_BASE = 'bm25'                      # Ex: 'bm25' ou 'tfidf'
BEST_RUN_TAG_PREFIX = "preproc_long"          # Ex: 'baseline_short', 'preproc_long'
USE_PREPROC_QUERY_FOR_RM3 = False             # Généralement False si BEST_QUERIES est déjà prétraité
# ----------------------------------------------------------------
print(f"Configuration choisie pour RM3:")
print(f"  Index: {os.path.basename(BEST_INDEX_PATH)}")
# print(f"  Requêtes: (variable BEST_QUERIES)") # Difficile d'afficher le nom de la variable
print(f"  Modèle Base: {BEST_MODEL_BASE}")
print(f"  Préfixe Tag: {BEST_RUN_TAG_PREFIX}")
print(f"  Utiliser Preproc Requête?: {USE_PREPROC_QUERY_FOR_RM3}")

# Nom du fichier et tag pour le run RM3
PRF_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3.txt")
RM3_RUN_TAG = f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3"

# Paramètres RM3
rm3_config = {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5}
print(f"  Paramètres RM3: {rm3_config}")

# --- Fonction de recherche RM3 (séquentielle) ---
# (Définition identique à celle de search_code_final, on peut la réutiliser si elle est dans la portée)
# Par sécurité, on la redéfinit ici au cas où l'utilisateur n'exécute que cette cellule après setup.
def perform_search_sequential_rm3(queries, index_path, model_base, k, output_run_file, run_tag, use_preprocessed_query=False, rm3_params=None):
    """Exécute la recherche RM3 séquentiellement."""
    start_time = time.time()
    print(f"\nDébut recherche SÉQUENTIELLE RM3: Modèle='{model_base}+RM3', Tag='{run_tag}', k={k}")
    all_results_list = []
    searcher = None
    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        if model_base == 'bm25': print("  Config BM25 (base)..."); searcher.set_bm25(k1=0.9, b=0.4)
        elif model_base == 'tfidf':
            if ClassicSimilarity is None: raise ValueError("ClassicSimilarity non chargée.")
            print("  Config ClassicSimilarity (base)...")
            try: searcher.set_similarity(ClassicSimilarity())
            except Exception as e_sim: print(f"ERREUR config ClassicSimilarity: {e_sim}"); return
        else: print(f"Modèle base '{model_base}' non reconnu, utilise BM25."); searcher.set_bm25()
        print("  Activation RM3..."); searcher.set_rm3(**rm3_params); print("  RM3 activé.")
        query_errors = 0
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche RM3 QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche RM3...")
        if all_results_list:
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True) # Assurer que le dossier existe
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites.")
        else: print("\n  Avertissement: Aucun résultat RM3 généré.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")
        end_time = time.time()
        print(f"Recherche RM3 terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run RM3 {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# Lancer la recherche RM3 (après configuration des variables BEST_...)
print("\nLancement de la recherche RM3...")
perform_search_sequential_rm3(
    BEST_QUERIES, BEST_INDEX_PATH, BEST_MODEL_BASE, K_RESULTS,
    PRF_RUN_FILE, RM3_RUN_TAG,
    use_preprocessed_query=USE_PREPROC_QUERY_FOR_RM3, rm3_params=rm3_config
)

print("\n--- Exécution de la recherche RM3 terminée. ---")


# --- Nouvelle Cellule ---

# === Cellule 5: Exécuter les Recherches (Séquentielles - BM25 & TF-IDF) ===
# Lance les 8 combinaisons de recherche et sauvegarde les résultats.
# Assurez-vous que l'environnement Java 21 est toujours actif.
# Assurez-vous que les index existent et que les variables sont définies.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import autoclass, JavaException # Pour TF-IDF

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Charger ClassicSimilarity
try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity'); print("ClassicSimilarity chargée.")
except Exception as e: print(f"ERREUR chargement ClassicSimilarity: {e}"); ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    # Vérifier aussi l'existence des index
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed manquant: {INDEX_DIR_PREPROC}")
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        if model == 'bm25': print("  Config BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'tfidf':
            if ClassicSimilarity is None: print("ERREUR: ClassicSimilarity non chargée. ABANDON."); return
            print("  Config ClassicSimilarity (TF-IDF)...")
            try: searcher.set_similarity(ClassicSimilarity()); print("  ClassicSimilarity configurée.")
            except Exception as e_sim: print(f"ERREUR config ClassicSimilarity: {e_sim}"); return
        else: print(f"Modèle '{model}' non reconnu, utilise BM25."); searcher.set_bm25()

        query_errors = 0
        # S'assurer que preprocess_text est défini avant la boucle
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
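                    # Ligne au format TREC: qid Q0 docid rank score run_tag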
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             # Créer le dossier RUN_DIR si besoin (normalement fait par setup)
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations ---
print("\n--- DÉBUT DES RECHERCHES BASELINE ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt"); perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt"); perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt"); perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt"); perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)
print("\n--- Toutes les recherches de base (mode séquentiel) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}
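
# --- Nouvelle Cellule ---

# Esquisse optionnelle: aperçu des premières lignes d'un fichier run
# pour vérifier le format TREC (qid Q0 docid rank score tag).
# Hypothèse: run_file_1 a bien été créé par la cellule précédente.
if os.path.exists(run_file_1):
    with open(run_file_1, 'r', encoding='utf-8') as f_check:
        for _ in range(5):
            line = f_check.readline()
            if not line: break
            print(line.rstrip())
else:
    print(f"Fichier non trouvé: {run_file_1}")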


# --- Nouvelle Cellule ---

OUTPUT_DIR = "/content/ap_output"
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")

# --- Nouvelle Cellule ---

# Chemin exact où les résultats de recherche sont attendus
RUN_DIR_PATH="/content/ap_output/runs/"

echo "Vérification du contenu de : ${RUN_DIR_PATH}"
ls -l ${RUN_DIR_PATH}


# --- Nouvelle Cellule ---

# === Cellule de Vérification du Contenu du Dossier Runs ===
# Utilise les commandes shell de Colab préfixées par '!'

# Chemin exact où les résultats de recherche sont attendus
# (Défini dans la cellule de configuration complète)
RUN_DIR_PATH="/content/ap_output/runs/"

# Utiliser '!' pour exécuter la commande shell 'echo'
print(f"Vérification du contenu de : {RUN_DIR_PATH}")

# Utiliser '!' pour exécuter la commande shell 'ls -l'
# Mettre le chemin entre guillemets pour gérer les espaces potentiels (même s'il n'y en a pas ici)
!ls -lh "{RUN_DIR_PATH}"


# --- Nouvelle Cellule ---

# === Cellule 2: Restauration des fichiers depuis Google Drive (Tout Inclus) ===
import os
import subprocess
import time

# Chemin où les fichiers ont été sauvegardés sur Drive
try: DRIVE_PROJECT_PATH # Défini dans la cellule précédente
except NameError: print("ERREUR: DRIVE_PROJECT_PATH non défini. Exécutez config complète."); raise

DRIVE_BACKUP_DIR = os.path.join(DRIVE_PROJECT_PATH, "colab_output_backup")

# Chemin cible dans Colab
TARGET_RESTORE_DIR = "/content/ap_output" # = OUTPUT_DIR défini précédemment

print(f"Source sur Drive : {DRIVE_BACKUP_DIR}")
print(f"Cible dans Colab : {TARGET_RESTORE_DIR}")

# Vérifier si le dossier de sauvegarde existe
if os.path.exists(DRIVE_BACKUP_DIR):
    os.makedirs(TARGET_RESTORE_DIR, exist_ok=True) # Créer dossier cible si besoin

    print("\nRestauration des fichiers (corpus et index) en cours... (Peut prendre plusieurs minutes)")
    # Commande de copie récursive
    copy_cmd = f"cp -r -v '{DRIVE_BACKUP_DIR}/.' '{TARGET_RESTORE_DIR}/'"
    try:
        process = subprocess.run(copy_cmd, shell=True, check=True, capture_output=True, text=True, timeout=900) # Timeout 15 minutes pour les index
        print("\nRestauration terminée avec succès !")
        print(f"Les fichiers de {DRIVE_BACKUP_DIR} ont été copiés dans {TARGET_RESTORE_DIR}")
        # Vérifier le contenu restauré (y compris les index)
        print("\nContenu du dossier restauré (partiel):")
        !ls -l {TARGET_RESTORE_DIR}
        print("\nContenu du dossier indexes (restauré):")
        !ls -l {TARGET_RESTORE_DIR}/indexes
    except subprocess.CalledProcessError as e:
         print(f"\nERREUR restauration (code {e.returncode}). Vérifiez si backup existe et contient corpus/, indexes/baseline/, indexes/preprocessed/.")
         print("STDERR:", e.stderr); raise e
    except Exception as e: print(f"\nERREUR restauration: {e}"); raise e
else:
    print(f"ERREUR: Dossier sauvegarde {DRIVE_BACKUP_DIR} inexistant.")
    print("Impossible de restaurer. Il faut relancer extraction et indexations.")
    raise FileNotFoundError(f"Dossier sauvegarde non trouvé: {DRIVE_BACKUP_DIR}")



# --- Nouvelle Cellule ---

# === Cellule 4: Exécuter les Recherches (Séquentielles - BM25 & QLD) ===
# Lance les 8 combinaisons de recherche en utilisant BM25 et QLD.
# S'assure que l'environnement Java 21 est actif et que les index/variables sont définis/restaurés.

from pyserini.search.lucene import LuceneSearcher # Import principal
import time
from tqdm.notebook import tqdm
import traceback
import os
from jnius import JavaException # Importer seulement JavaException, ClassicSimilarity n'est pas utilisé

# Définir K_RESULTS
try: K_RESULTS
except NameError: print("Définition K_RESULTS=1000"); K_RESULTS = 1000

# Vérifier variables nécessaires et existence des index restaurés
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
    if not os.path.exists(INDEX_DIR_BASELINE): raise FileNotFoundError(f"Index Baseline restauré manquant: {INDEX_DIR_BASELINE}")
    if not os.path.exists(INDEX_DIR_PREPROC): raise FileNotFoundError(f"Index Preprocessed restauré manquant: {INDEX_DIR_PREPROC}")
    # Vérifier aussi que les fichiers de corpus sont là (restaurés)
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs.jsonl")): raise FileNotFoundError("ap_docs.jsonl manquant après restauration.")
    if not os.path.exists(os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")): raise FileNotFoundError("ap_docs_preprocessed.jsonl manquant après restauration.")

except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise
except FileNotFoundError as e: print(f"ERREUR: {e}"); raise

def perform_search_sequential_qld(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes (BM25 ou QLD)."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}"
    print(f"\nDébut recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', k={k}")

    all_results_list = []
    searcher = None

    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")

        # Configurer similarité
        if model == 'bm25':
            print("  Configuration BM25..."); searcher.set_bm25(k1=0.9, b=0.4); print("  BM25 configuré.")
        elif model == 'qld': # Utiliser Query Likelihood Dirichlet
            print("  Configuration QLD..."); searcher.set_qld(); print("  QLD configuré.")
        else:
            print(f"Modèle '{model}' non reconnu, utilise BM25 par défaut."); searcher.set_bm25()

        # Itérer sur les requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")

        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue # Ignorer requêtes vides

                hits = searcher.search(search_text, k=k)

                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche...")

        # Écrire résultats
        if all_results_list:
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True)
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat généré pour ce run.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")

        end_time = time.time()
        print(f"Recherche terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# --- Exécution des 8 configurations (BM25 et QLD) ---
print("\n--- DÉBUT DES RECHERCHES BASELINE (BM25/QLD) ---")
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")
run_file_2 = os.path.join(RUN_DIR, "baseline_short_qld.txt"); perform_search_sequential_qld(queries_short, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_2, "baseline_short") # Utilise qld
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")
run_file_4 = os.path.join(RUN_DIR, "baseline_long_qld.txt"); perform_search_sequential_qld(queries_long, INDEX_DIR_BASELINE, 'qld', K_RESULTS, run_file_4, "baseline_long") # Utilise qld
print("\n--- DÉBUT DES RECHERCHES PRÉTRAITÉES (BM25/QLD) ---")
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
run_file_6 = os.path.join(RUN_DIR, "preproc_short_qld.txt"); perform_search_sequential_qld(queries_short_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False) # Utilise qld
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)
run_file_8 = os.path.join(RUN_DIR, "preproc_long_qld.txt"); perform_search_sequential_qld(queries_long_preprocessed, INDEX_DIR_PREPROC, 'qld', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False) # Utilise qld
print("\n--- Toutes les recherches de base (BM25/QLD) sont terminées. ---")

# Vérifier si des fichiers ont été créés
print(f"\nVérification du contenu de {RUN_DIR} après les recherches...")
!ls -l {RUN_DIR}



# --- Nouvelle Cellule ---

# === Cellule 6: Évaluation des Runs (BM25/QLD) ===
# Lit les fichiers Qrels, lit les fichiers de résultats (.txt) du dossier RUN_DIR,
# calcule MAP et P@10, et affiche/sauvegarde les tableaux récapitulatifs.
# Devrait maintenant évaluer les runs BM25 et QLD.

import pandas as pd
import glob
import pytrec_eval
import os
import traceback

# Vérifier que les chemins sont définis
try:
    QRELS_DIR
    RUN_DIR
    EVAL_DIR
except NameError:
    print("ERREUR: Variables de chemin non définies. Exécutez la cellule de configuration complète.")
    raise

print(f"Préparation des Qrels depuis: {QRELS_DIR}")
qrels_files = sorted(glob.glob(os.path.join(QRELS_DIR, "qrels.*.txt")))
if not qrels_files: print(f"ATTENTION: Aucun fichier Qrels trouvé dans {QRELS_DIR}."); qrels_dict = {}
else:
    print(f"Fichiers Qrels trouvés: {qrels_files}")
    all_qrels_data = []
    for qf in qrels_files:
        try:
            # Lire le fichier qrels en spécifiant les types pour éviter les erreurs
            qrels_df = pd.read_csv(qf, sep='\s+', names=['query_id', 'unused', 'doc_id', 'relevance'],
                                   dtype={'query_id': str, 'unused': str, 'doc_id': str, 'relevance': int})
            all_qrels_data.append(qrels_df[['query_id', 'doc_id', 'relevance']])
        except Exception as e: print(f"Erreur lecture Qrels {qf}: {e}")
    if not all_qrels_data: print("ERREUR: Impossible lire données Qrels."); qrels_dict = {}
    else:
        combined_qrels_df = pd.concat(all_qrels_data, ignore_index=True)
        qrels_dict = {}
        # Convertir le DataFrame en dictionnaire attendu par pytrec_eval
        for _, row in combined_qrels_df.iterrows():
            qid, did, rel = str(row['query_id']), str(row['doc_id']), int(row['relevance'])
            if rel < 0: continue # Ignorer jugements négatifs
            if qid not in qrels_dict: qrels_dict[qid] = {}
            qrels_dict[qid][did] = rel
        print(f"Total {len(qrels_dict)} requêtes avec jugements chargées.")

# --- Évaluation des Runs ---
if not qrels_dict: print("\nAucun jugement de pertinence chargé, impossible d'évaluer.")
else:
    measures = {'map', 'P_10'} # Métriques à calculer
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures) # Initialiser l'évaluateur
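    # Rappel: pytrec_eval attend des dictionnaires imbriqués
    #   qrels: {qid: {docid: pertinence (int)}}   run: {qid: {docid: score (float)}}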
    # Trouver tous les fichiers .txt dans le dossier des runs
    run_files = sorted(glob.glob(os.path.join(RUN_DIR, "*.txt")))
    print(f"\n{len(run_files)} fichiers de run à évaluer trouvés dans {RUN_DIR}.")
    print(f"  Fichiers: {[os.path.basename(f) for f in run_files]}") # Afficher les noms

    results_summary = [] # Liste pour stocker les résultats agrégés
    if not run_files: print(f"ATTENTION: Aucun fichier de run (.txt) trouvé dans {RUN_DIR}.")
    else:
        # Boucler sur chaque fichier de run trouvé
        for run_file in run_files:
            run_name = os.path.basename(run_file)
            print(f"\n--- Évaluation: {run_name} ---")
            run_dict = {} # Dictionnaire pour stocker les résultats de ce run
            error_count = 0
            line_count = 0
            try:
                # Lire le fichier run ligne par ligne
                with open(run_file, 'r', encoding='utf-8') as f_run:
                    for line in f_run:
                        line_count += 1
                        parts = line.strip().split()
                        # Vérifier le format TREC (6 colonnes)
                        if len(parts) != 6: error_count += 1; continue
                        qid, _, did, _, score, _ = parts # Extraire les infos utiles
                        try: score = float(score) # Convertir le score en float
                        except ValueError: error_count += 1; continue
                        qid = str(qid) # Assurer que qid est une chaîne
                        # Stocker le score pour ce document et cette requête
                        if qid not in run_dict: run_dict[qid] = {}
                        run_dict[qid][did] = score
                if error_count > 0: print(f"  Avertissement: {error_count} lignes mal formatées ignorées sur {line_count} lignes.")

                # Filtrer le run pour ne garder que les requêtes présentes dans les Qrels
                filtered_run_dict = {qid: docs for qid, docs in run_dict.items() if qid in qrels_dict}
                ignored_q = len(run_dict) - len(filtered_run_dict)
                if ignored_q > 0: print(f"  Avertissement: {ignored_q} requêtes run ignorées (absentes Qrels).")
                if not filtered_run_dict: print("  Erreur: Aucune requête ne correspond aux Qrels."); continue

                # Évaluer le run filtré avec pytrec_eval
                eval_results = evaluator.evaluate(filtered_run_dict)
                # Calculer les moyennes des métriques sur toutes les requêtes évaluées
                all_maps = [q_res.get("map", 0) for q_res in eval_results.values()]
                all_p10s = [q_res.get("P_10", 0) for q_res in eval_results.values()]
                avg_map = sum(all_maps) / len(all_maps) if all_maps else 0
                avg_p10 = sum(all_p10s) / len(all_p10s) if all_p10s else 0

                # Afficher les résultats moyens pour ce run
                print(f"  MAP: {avg_map:.4f}")
                print(f"  P@10: {avg_p10:.4f}")

                # Extraire les informations du nom de fichier pour le résumé
                parts = run_name.replace('.txt','').split('_')
                if len(parts) >= 3:
                    index_type, query_type = parts[0], parts[1]
                    # Le reste du nom forme le modèle (ex: 'bm25', 'qld' ou 'bm25_rm3')
                    model_type = "_".join(parts[2:])

                    # Ajouter les résultats au résumé
                    results_summary.append({
                        "Run Name": run_name, "Index": index_type,
                        "Query Type": query_type.capitalize(),
                        "Weighting Scheme": model_type.upper().replace('_', '+'), # Formatage pour affichage
                        "MAP": avg_map, "P@10": avg_p10
                    })
                else: print(f"  Avertissement: Impossible parser nom run '{run_name}'.")

            except FileNotFoundError: print(f"  Erreur: Fichier run non trouvé: {run_file}")
            except Exception as e: print(f"  Erreur évaluation {run_name}: {e}"); traceback.print_exc()

        # Afficher et sauvegarder le résumé final
        if results_summary:
            print("\n\n=== Tableau Récapitulatif des Résultats (BM25/QLD) ===")
            results_df = pd.DataFrame(results_summary)
            # Trier pour une meilleure lisibilité
            results_df = results_df.sort_values(by=["Index", "Query Type", "Weighting Scheme"])

            # Afficher le DataFrame complet
            print("\n--- Résultats Complets ---")
            print(results_df.to_markdown(index=False, floatfmt=".4f"))

            # Essayer d'afficher les tableaux pivots
            try:
                pivot_map = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='MAP')
                print("\n--- MAP (Tableau Pivot) ---")
                print(pivot_map.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot MAP: {e_pivot})")

            try:
                pivot_p10 = results_df.pivot_table(index=['Query Type', 'Weighting Scheme'], columns='Index', values='P@10')
                print("\n--- P@10 (Tableau Pivot) ---")
                print(pivot_p10.to_markdown(floatfmt=".4f"))
            except Exception as e_pivot: print(f"\n(Erreur création tableau pivot P@10: {e_pivot})")

            # Sauvegarder le DataFrame complet final
            summary_file_path = os.path.join(EVAL_DIR, "evaluation_summary_final.csv")
            try:
                 results_df.to_csv(summary_file_path, index=False)
                 print(f"\nTableau récapitulatif complet sauvegardé: {summary_file_path}")
            except Exception as e_save: print(f"\nErreur sauvegarde résumé: {e_save}")
        else: print("\nAucun résultat d'évaluation à afficher.")



# --- Nouvelle Cellule ---

# === Cellule 7: Exécuter la Recherche Améliorée (RM3) ===
# Applique RM3 sur la meilleure configuration de base identifiée à l'étape 6.
# !! N'OUBLIEZ PAS DE CONFIGURER LES VARIABLES BEST_... CI-DESSOUS !!

from pyserini.search.lucene import LuceneSearcher
from jnius import autoclass, JavaException
from tqdm.notebook import tqdm
import time
import traceback
import os

# Recharger ClassicSimilarity n'est plus nécessaire car on utilise BM25/QLD
# try: ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')
# except Exception: ClassicSimilarity = None

# Vérifier variables nécessaires
try:
    INDEX_DIR_BASELINE; INDEX_DIR_PREPROC; RUN_DIR; K_RESULTS; EVAL_DIR;
    queries_short; queries_long; queries_short_preprocessed; queries_long_preprocessed;
    preprocess_text;
except NameError as e: print(f"ERREUR: Variable {e} manquante. Exécutez config complète."); raise

# --- À CONFIGURER selon vos meilleurs résultats de l'Étape 6 (BM25/QLD) ---
# !! MODIFIEZ CECI EN FONCTION DE VOS RÉSULTATS D'ÉVALUATION !!
print("--- Configuration RM3 ---")
print("Veuillez éditer les variables BEST_... ci-dessous en fonction de vos meilleurs résultats MAP de l'étape précédente.")
# Exemple: si preproc + long + bm25 était le meilleur
BEST_INDEX_PATH = INDEX_DIR_PREPROC           # Ex: INDEX_DIR_BASELINE ou INDEX_DIR_PREPROC
BEST_QUERIES = queries_long_preprocessed      # Ex: queries_short, queries_long, ..._preprocessed
BEST_MODEL_BASE = 'bm25'                      # Ex: 'bm25' ou 'qld' (celui qui a donné le meilleur MAP)
BEST_RUN_TAG_PREFIX = "preproc_long"          # Ex: 'baseline_short', 'preproc_long'
USE_PREPROC_QUERY_FOR_RM3 = False             # Généralement False si BEST_QUERIES est déjà prétraité
# ----------------------------------------------------------------
print(f"Configuration choisie pour RM3:")
print(f"  Index: {os.path.basename(BEST_INDEX_PATH)}")
# print(f"  Requêtes: (variable BEST_QUERIES)") # Difficile d'afficher le nom de la variable
print(f"  Modèle Base: {BEST_MODEL_BASE}")
print(f"  Préfixe Tag: {BEST_RUN_TAG_PREFIX}")
print(f"  Utiliser Preproc Requête?: {USE_PREPROC_QUERY_FOR_RM3}")

# Nom du fichier et tag pour le run RM3
PRF_RUN_FILE = os.path.join(RUN_DIR, f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3.txt")
RM3_RUN_TAG = f"{BEST_RUN_TAG_PREFIX}_{BEST_MODEL_BASE}_rm3"

# Paramètres RM3
rm3_config = {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5}
print(f"  Paramètres RM3: {rm3_config}")

# --- Fonction de recherche RM3 (séquentielle) ---
def perform_search_sequential_rm3(queries, index_path, model_base, k, output_run_file, run_tag, use_preprocessed_query=False, rm3_params=None):
    """Exécute la recherche RM3 séquentiellement."""
    start_time = time.time()
    print(f"\nDébut recherche SÉQUENTIELLE RM3: Modèle='{model_base}+RM3', Tag='{run_tag}', k={k}")
    all_results_list = []
    searcher = None
    try:
        print(f"  Initialisation LuceneSearcher..."); searcher = LuceneSearcher(index_path); print(f"  LuceneSearcher initialisé.")
        # Configurer similarité base
        if model_base == 'bm25': print("  Config BM25 (base)..."); searcher.set_bm25(k1=0.9, b=0.4)
        elif model_base == 'qld': print("  Config QLD (base)..."); searcher.set_qld()
        else: print(f"Modèle base '{model_base}' non reconnu, utilise BM25."); searcher.set_bm25()
        # Activer RM3
        print("  Activation RM3..."); searcher.set_rm3(**rm3_params); print("  RM3 activé.")
        # Itérer sur requêtes
        query_errors = 0
        if 'preprocess_text' not in globals(): raise NameError("preprocess_text non définie.")
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                if not search_text.strip(): continue
                hits = searcher.search(search_text, k=k)
                for i in range(len(hits)):
                    rank, doc_id, score = i + 1, hits[i].docid, hits[i].score
                    if doc_id is None: continue
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")
            except Exception as e_query:
                query_errors += 1
                if query_errors < 5: print(f"\nErreur recherche RM3 QID {query_id}: {e_query}")
                elif query_errors == 5: print("\nPlusieurs erreurs recherche RM3...")
        # Écrire résultats
        if all_results_list:
             os.makedirs(os.path.dirname(output_run_file), exist_ok=True) # Assurer que le dossier existe
             with open(output_run_file, 'w', encoding='utf-8') as f_out: f_out.writelines(all_results_list)
             print(f"\n  {len(all_results_list)} lignes résultats écrites dans {os.path.basename(output_run_file)}.")
        else: print("\n  Avertissement: Aucun résultat RM3 généré.")
        if query_errors > 0: print(f"  Avertissement: {query_errors} erreurs sur requêtes.")
        end_time = time.time()
        print(f"Recherche RM3 terminée pour {run_tag}. Sauvegardé dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.")
    except Exception as e_main: print(f"\nERREUR MAJEURE run RM3 {run_tag}: {e_main}"); traceback.print_exc()
    finally:
        if searcher: print(f"  Nettoyage implicite ressources {run_tag}.")

# Lancer la recherche RM3 (après configuration des variables BEST_...)
print("\nLancement de la recherche RM3...")
perform_search_sequential_rm3(
    BEST_QUERIES, BEST_INDEX_PATH, BEST_MODEL_BASE, K_RESULTS,
    PRF_RUN_FILE, RM3_RUN_TAG,
    use_preprocessed_query=USE_PREPROC_QUERY_FOR_RM3, rm3_params=rm3_config
)

print("\n--- Exécution de la recherche RM3 terminée. ---")
# Vérifier si le fichier a été créé
print(f"\nVérification de la création du fichier {PRF_RUN_FILE}...")
!ls -l "{PRF_RUN_FILE}"



# --- Nouvelle Cellule ---

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")

ps = PorterStemmer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    filtered = [ps.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(filtered)

# --- Nouvelle Cellule ---

!pip install pysolr


# --- Nouvelle Cellule ---

# === Cellule 0.2: Installation des bibliothèques ===
# Pyserini 0.24.0 (version épinglée ci-dessous) fonctionne avec Java 11, installons-le
!apt-get update -qq > /dev/null && apt-get install -y openjdk-11-jdk-headless -qq > /dev/null

# Installer Pyserini, NLTK et Pytrec_eval
!pip install pyserini==0.24.0 -q # Installe une version spécifique pour la stabilité
!pip install nltk -q
!pip install pytrec_eval -q

# Définir la variable d'environnement JAVA_HOME
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Télécharger les ressources NLTK nécessaires
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) # Ajouté pour WordNet

print("Installation terminée et ressources NLTK téléchargées.")

# --- Nouvelle Cellule ---

# === Cellule 0.3: Définir les chemins ===
# !!! ADAPTEZ CE CHEMIN VERS VOTRE DOSSIER SUR GOOGLE DRIVE !!!
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Projet_RI"

# Vérification que le chemin existe
if not os.path.exists(DRIVE_PROJECT_PATH):
    raise FileNotFoundError(f"Le chemin spécifié n'existe pas : {DRIVE_PROJECT_PATH}. Vérifiez le chemin dans la Cellule 0.1 et 0.3.")

AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar")      # Archive TAR des documents AP
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "topics")       # Fichiers topics TREC
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "ql")            # Fichiers de jugements de pertinence (qrels)

# Chemins pour les sorties (index, résultats, etc.)
OUTPUT_DIR = os.path.join(DRIVE_PROJECT_PATH, "output")
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "pre")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "Corpus")  # Pour les documents extraits/formatés
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")       # Pour les fichiers de résultats TREC
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")      # Pour les fichiers d'évaluation

# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

print(f"Chemin du projet Drive: {DRIVE_PROJECT_PATH}")
print(f"Répertoire de sortie Colab: {OUTPUT_DIR}")

# --- Nouvelle Cellule ---

# === Cellule 0.4: Extraire et Formater les Documents ===
import tarfile
import re
import json
from tqdm.notebook import tqdm # Barre de progression

# Chemin vers le fichier JSONL qui sera généré
JSONL_OUTPUT_PATH = os.path.join(CORPUS_DIR, "ap_docs.jsonl")

print(f"Extraction et formatage des documents depuis {AP_TAR_PATH} vers {JSONL_OUTPUT_PATH}...")

# Regex pour extraire DOCNO et TEXT
doc_pattern = re.compile(r"<DOC>(.*?)</DOC>", re.DOTALL)
docno_pattern = re.compile(r"<DOCNO>\s*(.*?)\s*</DOCNO>")
text_pattern = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)

# Compteur pour vérifier
doc_count = 0

# Ouvrir/créer le fichier JSONL de sortie
with open(JSONL_OUTPUT_PATH, 'w') as outfile, tarfile.open(AP_TAR_PATH, "r") as tar:  # mode "r": la compression (.tar ou .tar.gz) est détectée automatiquement
    # Itérer sur chaque membre (fichier/dossier) dans l'archive tar
    for member in tqdm(tar.getmembers(), desc="Traitement des fichiers TAR"):
        # Vérifier si c'est un fichier régulier
        if member.isfile():
            # Extraire le contenu du fichier
            f = tar.extractfile(member)
            if f: # S'assurer que l'extraction a réussi
                content = f.read().decode('utf-8', errors='ignore') # Lire et décoder

                # Trouver tous les documents dans le fichier actuel
                for doc_match in doc_pattern.finditer(content):
                    doc_content = doc_match.group(1)

                    # Extraire DOCNO
                    docno_match = docno_pattern.search(doc_content)
                    if not docno_match:
                        continue # Passer si pas de DOCNO
                    doc_id = docno_match.group(1).strip()

                    # Extraire TEXT (et le nettoyer un peu)
                    text_match = text_pattern.search(doc_content)
                    if text_match:
                       doc_text = text_match.group(1).strip()
                       # Nettoyage simple: remplacer les nouvelles lignes par des espaces
                       doc_text = ' '.join(doc_text.split())
                    else:
                        doc_text = "" # Mettre une chaîne vide si pas de champ TEXT

                    # Écrire l'entrée JSONL
                    json_line = json.dumps({"id": doc_id, "contents": doc_text})
                    outfile.write(json_line + '\n')
                    doc_count += 1

print(f"Terminé. {doc_count} documents formatés dans {JSONL_OUTPUT_PATH}")
# Note: la collection AP88-90 contient environ 164 597 documents; vérifiez que doc_count en est proche.
# Le mode d'ouverture "r" gère indifféremment AP.tar et AP.tar.gz, et le code fonctionne
# que les fichiers soient à la racine de l'archive ou dans des sous-dossiers (ap88, ap89, ...).
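
# Vérification rapide (esquisse): nombre de lignes (= documents) dans le JSONL généré.
!wc -l {JSONL_OUTPUT_PATH}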

# --- Nouvelle Cellule ---

# === Cellule 1.1: Fonction de Prétraitement ===
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Applique la tokenisation, la mise en minuscule, la suppression
    de la ponctuation, la suppression des stop words et la lemmatisation.
    """
    # Tokenisation et minuscules
    tokens = word_tokenize(text.lower())

    # Suppression ponctuation et mots non alphabétiques + stop words
    filtered_tokens = [
        lemmatizer.lemmatize(w) for w in tokens
        if w.isalpha() and w not in stop_words # Garde seulement les mots alphabétiques non-stop words
    ]

    # Rejoint les tokens en une chaîne de caractères
    return ' '.join(filtered_tokens)

# Exemple d'utilisation
sample_text = "This is an example showing Information Retrieval with lemmatization and stop words removal."
preprocessed_sample = preprocess_text(sample_text)
print(f"Original: {sample_text}")
print(f"Preprocessed: {preprocessed_sample}")

# --- Nouvelle Cellule ---

# === Cellule 1.2: Indexation Baseline ===
print(f"Début de l'indexation Baseline (sans prétraitement explicite)...")
print(f"Collection source: {JSONL_OUTPUT_PATH}")
print(f"Répertoire de l'index: {INDEX_DIR_BASELINE}")

# Commande Pyserini pour l'indexation
# -input: dossier contenant les fichiers JSONL
# -collection: type de collection (JsonCollection pour nos fichiers .jsonl)
# -generator: comment traiter les fichiers (LuceneDocumentGenerator crée un document par ligne JSON)
# -index: chemin où sauvegarder l'index
# -threads: nombre de threads à utiliser (ajustez selon les ressources Colab, 4 est raisonnable)
# -storePositions -storeDocvectors -storeRaw: stocke informations supplémentaires utiles pour certaines recherches avancées (comme le re-ranking ou PRF)
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input {CORPUS_DIR} \
  --index {INDEX_DIR_BASELINE} \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw

print(f"Indexation Baseline terminée. Index créé dans {INDEX_DIR_BASELINE}")

# --- Nouvelle Cellule ---

# === Cellule 1.3: Préparer les Données Prétraitées ===
JSONL_PREPROC_PATH = os.path.join(CORPUS_DIR, "ap_docs_preprocessed.jsonl")

print(f"Préparation des données prétraitées vers {JSONL_PREPROC_PATH}...")

doc_count_preproc = 0
# Lire le fichier JSONL original et écrire le fichier prétraité
with open(JSONL_OUTPUT_PATH, 'r') as infile, open(JSONL_PREPROC_PATH, 'w') as outfile:
    for line in tqdm(infile, desc="Prétraitement des documents"):
        try:
            data = json.loads(line)
            doc_id = data['id']
            original_contents = data['contents']

            # Appliquer le prétraitement
            preprocessed_contents = preprocess_text(original_contents)

            # Écrire la nouvelle ligne JSONL
            json_line = json.dumps({"id": doc_id, "contents": preprocessed_contents})
            outfile.write(json_line + '\n')
            doc_count_preproc += 1
        except json.JSONDecodeError:
            print(f"Erreur de décodage JSON sur une ligne, ignorée.") # Au cas où une ligne serait malformée
        except Exception as e:
            print(f"Erreur inattendue lors du prétraitement: {e}") # Autres erreurs possibles

print(f"Terminé. {doc_count_preproc} documents prétraités dans {JSONL_PREPROC_PATH}")

# --- Nouvelle Cellule ---

# === Cellule 1.4: Indexation Avec Prétraitement ===
print(f"Début de l'indexation avec Prétraitement...")
print(f"Collection source: {JSONL_PREPROC_PATH}") # Utilise le fichier .jsonl prétraité
print(f"Répertoire de l'index: {INDEX_DIR_PREPROC}")

# La commande est identique, mais pointe vers le fichier JSONL prétraité
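# Attention: --input indexe tous les fichiers .jsonl présents dans CORPUS_DIR; si ap_docs.jsonl
# (non prétraité) s'y trouve encore, il sera indexé avec. Idéalement, placer chaque fichier
# .jsonl dans son propre sous-dossier et pointer --input vers ce sous-dossier.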
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input {CORPUS_DIR} \
  --index {INDEX_DIR_PREPROC} \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw \
  --pretokenized # Important: Indique que le texte est déjà tokenisé (évite une re-tokenisation par Lucene)

print(f"Indexation avec Prétraitement terminée. Index créé dans {INDEX_DIR_PREPROC}")

# --- Nouvelle Cellule ---

# === Cellule 2.1: Parser les Fichiers Topics ===
import glob # Pour trouver les fichiers correspondant à un pattern

def parse_topics(file_path):
    """Parse un fichier topic TREC standard."""
    topics = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        # Utilise regex pour trouver chaque bloc <top>
        for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
            topic_content = top_match.group(1)
            # Extrait le numéro (num)
            num_match = re.search(r"<num>\s*Number:\s*(\d+)", topic_content, re.IGNORECASE)
            if not num_match: continue
            topic_id = num_match.group(1).strip()

            # Extrait le titre (title) - prend tout après <title> jusqu'au prochain tag
            title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
            title = title_match.group(1).strip() if title_match else ""

            # Extrait la description (desc)
            desc_match = re.search(r"<desc>\s*Description:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
            desc = desc_match.group(1).strip() if desc_match else ""

            # Extrait la narrative (narr) - pas utilisée ici mais pourrait l'être
            # narr_match = re.search(r"<narr>\s*Narrative:\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
            # narr = narr_match.group(1).strip() if narr_match else ""

            if topic_id and title: # Au moins un ID et un titre
                 topics[topic_id] = {'title': title, 'desc': desc}
    return topics

# Trouver tous les fichiers topics
topic_files = sorted(glob.glob(os.path.join(TOPICS_DIR, "topics.*.txt")))
print(f"Fichiers topics trouvés: {topic_files}")

all_topics = {}
for tf in topic_files:
    print(f"Parsing {tf}...")
    all_topics.update(parse_topics(tf))

print(f"Total de {len(all_topics)} topics parsés.")

# Créer les dictionnaires de requêtes courtes et longues
queries_short = {qid: data['title'] for qid, data in all_topics.items()}
queries_long = {qid: data['title'] + " " + data['desc'] for qid, data in all_topics.items()} # Concatène titre et description

# Optionnel: Créer des versions prétraitées des requêtes
queries_short_preprocessed = {qid: preprocess_text(q) for qid, q in queries_short.items()}
queries_long_preprocessed = {qid: preprocess_text(q) for qid, q in queries_long.items()}

print(f"Exemple Requête Courte (ID 51): {queries_short.get('51', 'Non trouvé')}")
print(f"Exemple Requête Longue (ID 51): {queries_long.get('51', 'Non trouvé')}")
print(f"Exemple Requête Courte Prétraitée (ID 51): {queries_short_preprocessed.get('51', 'Non trouvé')}")
print(f"Exemple Requête Longue Prétraitée (ID 51): {queries_long_preprocessed.get('51', 'Non trouvé')}")

# --- Nouvelle Cellule ---

# === Cellule 3.1: Fonction de Recherche et Sauvegarde ===
from pyserini.search.lucene import LuceneSearcher
import time
from multiprocessing import Pool, cpu_count

# --- Configuration des modèles de similarité ---
# Pyserini/Lucene utilise BM25 par défaut (avec k1=0.9, b=0.4)
# Pour TF-IDF, nous utilisons ClassicSimilarity de Lucene.
# Cela nécessite d'importer la classe Java via Pyjnius (le pont Python-Java de Pyserini)
from jnius import autoclass
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_single_query(args):
    """Fonction exécutée par chaque processus pour une seule requête."""
    query_id, query_text, index_path, model, k, run_tag, use_preprocessed_query = args

    try:
        # Initialiser le searcher DANS le processus fils
        searcher = LuceneSearcher(index_path)

        # Configurer le modèle de similarité
        if model == 'bm25':
            # Utiliser les valeurs par défaut de Pyserini ou spécifier les vôtres
            searcher.set_bm25(k1=0.9, b=0.4) # Valeurs standard BM25 TREC
        elif model == 'tfidf':
            searcher.set_similarity(ClassicSimilarity()) # Appliquer TF-IDF (ClassicSimilarity)
        else:
            # Par défaut ou erreur
            searcher.set_bm25() # Rétablir BM25 par sécurité

        # Prétraiter la requête si nécessaire (pour l'index prétraité)
        search_text = preprocess_text(query_text) if use_preprocessed_query else query_text

        # Exécuter la recherche
        hits = searcher.search(search_text, k=k)

        # Formater les résultats pour cette requête
        query_results = []
        for i in range(len(hits)):
            rank = i + 1
            doc_id = hits[i].docid
            score = hits[i].score
            # Format TREC: qid Q0 docid rank score run_tag
            query_results.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

        return query_results

    except Exception as e:
        print(f"Erreur lors de la recherche pour QID {query_id} avec {run_tag}: {e}")
        return [] # Retourne une liste vide en cas d'erreur


def run_search_parallel(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche en parallèle pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    # Préparer les arguments pour chaque tâche de processus
    tasks = []
    for query_id, query_text in queries.items():
        tasks.append((query_id, query_text, index_path, model, k, run_tag, use_preprocessed_query))

    # Utiliser un Pool de processus pour la parallélisation
    # Utiliser N-1 coeurs pour laisser un peu de marge, ou cpu_count()
    num_workers = max(1, cpu_count() - 1)
    print(f"Utilisation de {num_workers} processus parallèles...")

    all_results_list = []
    # Utiliser tqdm pour la barre de progression avec le Pool
    with Pool(num_workers) as pool:
       # pool.imap_unordered exécute les tâches et retourne les résultats dès qu'ils sont prêts
       # Cela peut être plus rapide si certaines requêtes prennent plus de temps
       results_iterator = pool.imap_unordered(perform_search_single_query, tasks)
       # Envelopper avec tqdm pour la barre de progression
       for result in tqdm(results_iterator, total=len(tasks), desc=f"Recherche {run_tag}"):
           all_results_list.extend(result) # Ajouter les lignes de résultats retournées par chaque processus


    # Écrire les résultats dans le fichier de run TREC
    with open(output_run_file, 'w') as f_out:
       f_out.writelines(all_results_list)

    end_time = time.time()
    print(f"Recherche terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
    print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")


# --- Exécution des différentes configurations ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête (standard TREC)

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
run_search_parallel(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
run_search_parallel(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
run_search_parallel(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
run_search_parallel(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# Important: Utiliser les requêtes prétraitées correspondantes

# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
run_search_parallel(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)
# Note: these queries are already preprocessed, so use_preprocessed_query stays False
# (when True, the function applies preprocess_text itself, which would be redundant here).
# Alternative: pass the raw `queries_short` with use_preprocessed_query=True; the first option is kept for clarity.

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
run_search_parallel(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
run_search_parallel(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
run_search_parallel(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base sont terminées.")

# --- Nouvelle Cellule ---

# === Cellule 3.1 (Modifiée): Fonction de Recherche et Sauvegarde (Séquentielle d'abord) ===
from pyserini.search.lucene import LuceneSearcher
import time
from tqdm.notebook import tqdm # Toujours utile pour la progression
import traceback # Pour afficher les erreurs détaillées

# --- Configuration des modèles de similarité ---
from jnius import autoclass, JavaException
ClassicSimilarity = autoclass('org.apache.lucene.search.similarities.ClassicSimilarity')

def perform_search_sequential(queries, index_path, model, k, output_run_file, run_tag_prefix, use_preprocessed_query=False):
    """Exécute la recherche séquentiellement pour un ensemble de requêtes."""
    start_time = time.time()
    run_tag = f"{run_tag_prefix}_{model}" # Ex: baseline_short_bm25
    print(f"Début recherche SÉQUENTIELLE: Index='{os.path.basename(index_path)}', Modèle='{model}', Tag='{run_tag}', PréprocReq={use_preprocessed_query}, k={k}")

    all_results_list = []
    searcher = None # Initialiser à None

    try:
        # Initialiser le searcher UNE SEULE FOIS pour toutes les requêtes de ce run
        print(f"  Initialisation de LuceneSearcher pour {run_tag}...")
        searcher = LuceneSearcher(index_path)
        print(f"  LuceneSearcher initialisé.")

        # Configurer le modèle de similarité
        if model == 'bm25':
            print("  Configuration de BM25...")
            searcher.set_bm25(k1=0.9, b=0.4)
            print("  BM25 configuré.")
        elif model == 'tfidf':
            print("  Configuration de ClassicSimilarity (TF-IDF)...")
            try:
                 searcher.set_similarity(ClassicSimilarity())
                 print("  ClassicSimilarity configurée.")
            except JavaException as e:
                 print(f"ERREUR Java lors de la configuration de ClassicSimilarity: {e}")
                 print(traceback.format_exc()) # Affiche la trace complète de l'erreur Java
                 raise # Arrête l'exécution pour ce run si la similarité ne peut être définie
        else:
            print("  Configuration BM25 par défaut...")
            searcher.set_bm25()
            print("  BM25 par défaut configuré.")

        # Itérer sur les requêtes séquentiellement
        for query_id, query_text in tqdm(queries.items(), desc=f"Recherche {run_tag}"):
            try:
                search_text = preprocess_text(query_text) if use_preprocessed_query else query_text
                hits = searcher.search(search_text, k=k)

                # Formater les résultats pour cette requête
                for i in range(len(hits)):
                    rank = i + 1
                    doc_id = hits[i].docid
                    score = hits[i].score
                    all_results_list.append(f"{query_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")

            except Exception as e_query:
                print(f"\nErreur lors de la recherche pour QID {query_id} avec {run_tag}: {e_query}")
                # Continue avec la requête suivante

        # Écrire les résultats dans le fichier de run TREC
        with open(output_run_file, 'w') as f_out:
           f_out.writelines(all_results_list)

        end_time = time.time()
        print(f"Recherche SÉQUENTIELLE terminée pour {run_tag}. Résultats sauvegardés dans {output_run_file}")
        print(f"Temps écoulé: {end_time - start_time:.2f} secondes.\n")

    except Exception as e_main:
        print(f"\nERREUR MAJEURE pendant l'exécution de {run_tag}: {e_main}")
        print(traceback.format_exc()) # Affiche la trace complète de l'erreur
    finally:
        # Important: Fermer le searcher pour libérer les ressources Java, même en cas d'erreur
        if searcher:
             try:
                 # Note: Pyserini ne semble pas avoir de méthode close() explicite sur LuceneSearcher
                 # La JVM devrait se nettoyer, mais c'est une bonne pratique si disponible
                 # searcher.close() # Décommentez si une telle méthode existe dans votre version
                 print(f"  Nettoyage implicite des ressources pour {run_tag}.")
                 pass
             except Exception as e_close:
                 print(f"  Erreur lors de la tentative de fermeture du searcher pour {run_tag}: {e_close}")


# --- Exécution des différentes configurations (en mode séquentiel) ---
K_RESULTS = 1000 # Nombre de documents à retourner par requête

# 1. Index Baseline + Requêtes Courtes + BM25
run_file_1 = os.path.join(RUN_DIR, "baseline_short_bm25.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_1, "baseline_short")

# 2. Index Baseline + Requêtes Courtes + TF-IDF
run_file_2 = os.path.join(RUN_DIR, "baseline_short_tfidf.txt")
perform_search_sequential(queries_short, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_2, "baseline_short")

# 3. Index Baseline + Requêtes Longues + BM25
run_file_3 = os.path.join(RUN_DIR, "baseline_long_bm25.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'bm25', K_RESULTS, run_file_3, "baseline_long")

# 4. Index Baseline + Requêtes Longues + TF-IDF
run_file_4 = os.path.join(RUN_DIR, "baseline_long_tfidf.txt")
perform_search_sequential(queries_long, INDEX_DIR_BASELINE, 'tfidf', K_RESULTS, run_file_4, "baseline_long")

# --- Recherches sur l'index prétraité ---
# 5. Index Preprocessed + Requêtes Courtes (Prétraitées) + BM25
run_file_5 = os.path.join(RUN_DIR, "preproc_short_bm25.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_5, "preproc_short", use_preprocessed_query=False)

# 6. Index Preprocessed + Requêtes Courtes (Prétraitées) + TF-IDF
run_file_6 = os.path.join(RUN_DIR, "preproc_short_tfidf.txt")
perform_search_sequential(queries_short_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_6, "preproc_short", use_preprocessed_query=False)

# 7. Index Preprocessed + Requêtes Longues (Prétraitées) + BM25
run_file_7 = os.path.join(RUN_DIR, "preproc_long_bm25.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'bm25', K_RESULTS, run_file_7, "preproc_long", use_preprocessed_query=False)

# 8. Index Preprocessed + Requêtes Longues (Prétraitées) + TF-IDF
run_file_8 = os.path.join(RUN_DIR, "preproc_long_tfidf.txt")
perform_search_sequential(queries_long_preprocessed, INDEX_DIR_PREPROC, 'tfidf', K_RESULTS, run_file_8, "preproc_long", use_preprocessed_query=False)

print("Toutes les recherches de base (mode séquentiel) sont terminées.")

# --- Important note ---
# If this cell completes without crashing (even if it is slow), the earlier failures were most
# likely caused by the parallelisation (memory pressure / JVM conflicts across processes).
# If it still crashes, especially on the 'tfidf' runs, the problem more likely lies with
# ClassicSimilarity or with the Java environment itself.
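
# --- Isolation test (added sketch) ---
# To separate the two hypotheses above, this minimal check exercises ClassicSimilarity on a
# single searcher and a single query, with no multiprocessing at all. If it crashes, the issue
# lies with ClassicSimilarity / the Java bridge; if it passes while the parallel cell crashes,
# suspect per-process JVM memory or conflicts. Assumes INDEX_DIR_BASELINE and queries_short
# from the earlier cells.
try:
    test_searcher = LuceneSearcher(INDEX_DIR_BASELINE)
    test_searcher.set_similarity(ClassicSimilarity())
    test_qid, test_qtext = next(iter(queries_short.items()))
    test_hits = test_searcher.search(test_qtext, k=10)
    print(f"Isolation test OK: QID {test_qid} returned {len(test_hits)} hits with ClassicSimilarity.")
except Exception as e_isolation:
    print(f"Isolation test FAILED: {e_isolation}")
    print(traceback.format_exc())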


# --- Nouvelle Cellule ---

!pip install pyserini

# --- Nouvelle Cellule ---

# === Cellule 1.1: Fonction de Prétraitement ===
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Applique la tokenisation, la mise en minuscule, la suppression
    de la ponctuation, la suppression des stop words et la lemmatisation.
    """
    # Tokenisation et minuscules
    tokens = word_tokenize(text.lower())

    # Suppression ponctuation et mots non alphabétiques + stop words
    filtered_tokens = [
        lemmatizer.lemmatize(w) for w in tokens
        if w.isalpha() and w not in stop_words # Garde seulement les mots alphabétiques non-stop words
    ]

    # Rejoint les tokens en une chaîne de caractères
    return ' '.join(filtered_tokens)

# Exemple d'utilisation
sample_text = "This is an example showing Information Retrieval with lemmatization and stop words removal."
preprocessed_sample = preprocess_text(sample_text)
print(f"Original: {sample_text}")
print(f"Preprocessed: {preprocessed_sample}")

# --- Nouvelle Cellule ---

import nltk
nltk.download('punkt_tab')
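
# The preprocessing function above also relies on the NLTK stop-word list, WordNet (for
# lemmatisation) and the Punkt tokenizer. If the setup cell did not already fetch them,
# the downloads below cover those dependencies (safe to re-run; NLTK skips what is present).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')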

# --- Nouvelle Cellule ---

# === Cellule 0.3: Définir les chemins ===
# !!! ADAPTEZ CE CHEMIN VERS VOTRE DOSSIER SUR GOOGLE DRIVE !!!
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Projet_RI"

# Vérification que le chemin existe
if not os.path.exists(DRIVE_PROJECT_PATH):
    raise FileNotFoundError(f"Le chemin spécifié n'existe pas : {DRIVE_PROJECT_PATH}. Vérifiez le chemin dans la Cellule 0.1 et 0.3.")

AP_TAR_PATH = os.path.join(DRIVE_PROJECT_PATH, "AP.tar")   # Plain .tar archive (adjust the name/mode if yours is a .tar.gz)
TOPICS_DIR = os.path.join(DRIVE_PROJECT_PATH, "topics/")   # TREC topic files
QRELS_DIR = os.path.join(DRIVE_PROJECT_PATH, "ql/")        # Relevance judgments (qrels)

# Output paths (index, runs, evaluation files) under the project folder
OUTPUT_DIR = os.path.join(DRIVE_PROJECT_PATH, "output/")
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "pre")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "Corpus") # Pour les documents extraits/formatés
RUN_DIR = os.path.join(OUTPUT_DIR, "runs") # Pour les fichiers de résultats TREC
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval") # Pour les fichiers d'évaluation

# Créer les répertoires de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR_BASELINE, exist_ok=True)
os.makedirs(INDEX_DIR_PREPROC, exist_ok=True)
os.makedirs(CORPUS_DIR, exist_ok=True)
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

print(f"Chemin du projet Drive: {DRIVE_PROJECT_PATH}")
print(f"Répertoire de sortie Colab: {OUTPUT_DIR}")

# --- Nouvelle Cellule ---

import tarfile

# Open the JSONL output file for writing and the AP archive for reading ("r:" = uncompressed .tar)
with open(JSONL_OUTPUT_PATH, 'w') as outfile, tarfile.open(AP_TAR_PATH, "r:") as tar:
    ...  # the body of this cell is missing from the source notebook

To cite this code:

Loyer, Dominique. (2024). v1832.ipynb [Source code]. Retrieved from https://dominiqueloyer.github.io/Codes.html

vf2300-at-2025-07-05T22_24_15.381Z-pinned.ttl

This Turtle (TTL) file defines an OWL ontology for managing research-grant competitions (based on the DIC9335 assignment, Winter 2025): calls for proposals, project proposals, evaluation committees, individual and summary evaluations, committee recommendations, and axioms intended to classify proposals into Categories I, II and III.

Keywords: ontology, OWL, Turtle, research grants, automatic classification

#################################################################
#    Ontologie pour la Gestion des Subventions de Recherche
#    Basée sur le document TravailOnto-e1-SubvRech.pdf (DIC9335 Hiver 2025)
#    Formatage d'en-tête basé sur bon format.txt, contenu complet.
#    Syntaxe: Turtle (TTL) - Correction des listes intersectionOf
#################################################################

#################################################################
#    Préfixes et Base IRI (Style bon format.txt)
#################################################################

@prefix : <http://www.dic9335.uqam.ca/ontologies/subvention-recherche#> .
@base <http://www.dic9335.uqam.ca/ontologies/subvention-recherche> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

#################################################################
#    Métadonnées de l'Ontologie (Style bon format.txt)
#################################################################

<http://www.dic9335.uqam.ca/ontologies/subvention-recherche> rdf:type owl:Ontology ;
  rdfs:comment "Ontologie modélisant les concepts liés à la gestion des concours de subventions à la recherche, basée sur l'énoncé du travail DIC9335 Hiver 2025. Format compact, contenu complet."@fr ;
  rdfs:label "Ontologie Subvention Recherche (Compacte)"@fr ;
  owl:versionInfo "1.3.1"^^xsd:string . # Version incrémentée pour correction

#################################################################
#    Déclaration des Classes (Concepts) - Étape 1
#################################################################

:Concours rdf:type owl:Class ;
  rdfs:label "Concours de subvention"@fr ;
  rdfs:comment "Représente un événement de mise en compétition pour l'obtention de financement de recherche."@fr .

:AppelAPropositions rdf:type owl:Class ;
  rdfs:label "Appel à propositions"@fr ;
  rdfs:comment "Document annonçant un concours, ses conditions et ses dates importantes."@fr .

:PropositionDeProjet rdf:type owl:Class ;
  rdfs:label "Proposition de projet"@fr ;
  rdfs:comment "Document soumis par un ou des chercheurs décrivant un projet de recherche proposé pour financement."@fr .

:Evaluation rdf:type owl:Class ;
  rdfs:label "Évaluation"@fr ;
  rdfs:comment "Processus ou résultat de l'appréciation d'une proposition par un évaluateur ou un comité."@fr .

:EvaluationIndividuelle rdf:type owl:Class ;
  rdfs:subClassOf :Evaluation ;
  rdfs:label "Évaluation individuelle"@fr ;
  rdfs:comment "Évaluation d'une proposition réalisée par un seul membre évaluateur."@fr .

:EvaluationSommaire rdf:type owl:Class ;
  rdfs:subClassOf :Evaluation ;
  rdfs:label "Évaluation sommaire"@fr ;
  rdfs:comment "Synthèse des évaluations individuelles pour une proposition, établissant une note unique par critère."@fr .

:RecommandationComite rdf:type owl:Class ;
  rdfs:label "Recommandation du comité"@fr ;
  rdfs:comment "Décision suggérée par le comité d'évaluation concernant une proposition (ex: financement, rejet)."@fr .

:DecisionFinancement rdf:type owl:Class ;
  rdfs:label "Décision de financement"@fr ;
  rdfs:comment "Décision finale quant à l'acceptabilité et au niveau de financement d'une proposition."@fr .

:DisciplineDeRecherche rdf:type owl:Class ;
  rdfs:label "Discipline de recherche"@fr ;
  rdfs:comment "Domaine scientifique ou académique concerné par un concours ou une proposition."@fr .

:CritereEvaluation rdf:type owl:Class ;
  rdfs:label "Critère d'évaluation"@fr ;
  rdfs:comment "Aspect spécifique selon lequel une proposition est évaluée (ex: excellence, potentiel, retombées)."@fr .

:NoteEvaluation rdf:type owl:Class ;
  rdfs:label "Note d'évaluation"@fr ;
  rdfs:comment "Valeur qualitative ou quantitative assignée lors d'une évaluation (ex: A, B, C)."@fr .

:Chercheur rdf:type owl:Class ;
  rdfs:label "Chercheur"@fr ;
  rdfs:comment "Personne menant des activités de recherche."@fr .

:ChercheurJunior rdf:type owl:Class ;
  rdfs:subClassOf :Chercheur ;
  rdfs:label "Chercheur junior"@fr .

:ChercheurEtabli rdf:type owl:Class ;
  rdfs:subClassOf :Chercheur ;
  rdfs:label "Chercheur établi"@fr .

:ChercheurSenior rdf:type owl:Class ;
  rdfs:subClassOf :Chercheur ;
  rdfs:label "Chercheur sénior"@fr .

[ rdf:type owl:AllDisjointClasses ;
  owl:members ( :ChercheurJunior :ChercheurEtabli :ChercheurSenior )
] .

:PorteurDeProjet rdf:type owl:Class ;
  rdfs:subClassOf :Chercheur ;
  rdfs:label "Porteur de projet"@fr ;
  rdfs:comment "Chercheur principal responsable de la soumission d'une proposition."@fr .

:ComiteEvaluation rdf:type owl:Class ;
  rdfs:label "Comité d'évaluation"@fr ;
  rdfs:comment "Groupe de chercheurs chargé d'évaluer les propositions pour une ou plusieurs disciplines."@fr .

:MembreComite rdf:type owl:Class ;
  rdfs:subClassOf :Chercheur ;
  rdfs:label "Membre de comité"@fr ;
  rdfs:comment "Chercheur (sénior ou établi) faisant partie d'un comité d'évaluation."@fr .

:PresidentComite rdf:type owl:Class ;
  rdfs:subClassOf :ChercheurSenior ;
  rdfs:subClassOf :MembreComite ;
  rdfs:label "Président de comité"@fr ;
  rdfs:comment "Chercheur sénior dirigeant un comité d'évaluation."@fr .

:StatutProposition rdf:type owl:Class ;
  rdfs:label "Statut de proposition"@fr ;
  rdfs:comment "Classification d'une proposition suite à l'évaluation (Catégorie I, II, ou III)."@fr .

:Categorie_I rdf:type owl:Class ;
  rdfs:subClassOf :StatutProposition ;
  rdfs:label "Catégorie I"@fr ;
  rdfs:comment "Proposition recommandée pour financement."@fr .

:Categorie_II rdf:type owl:Class ;
  rdfs:subClassOf :StatutProposition ;
  rdfs:label "Catégorie II"@fr ;
  rdfs:comment "Proposition recommandée pour financement conditionnel."@fr .

:Categorie_III rdf:type owl:Class ;
  rdfs:subClassOf :StatutProposition ;
  rdfs:label "Catégorie III"@fr ;
  rdfs:comment "Proposition recommandée pour rejet."@fr .

[ rdf:type owl:AllDisjointClasses ;
  owl:members ( :Categorie_I :Categorie_II :Categorie_III )
] .

:NoteAttribuee rdf:type owl:Class ;
  rdfs:label "Note attribuée"@fr ;
  rdfs:comment "Représente l'attribution d'une note spécifique à un critère spécifique dans le cadre d'une évaluation."@fr .

:PropositionDeCategorieI rdf:type owl:Class ;
  rdfs:comment "Classe définissant les conditions pour être Catégorie I."@fr .

:PropositionDeCategorieII rdf:type owl:Class ;
  rdfs:comment "Classe définissant les conditions pour être Catégorie II."@fr .

:PropositionDeCategorieIII rdf:type owl:Class ;
  rdfs:comment "Classe définissant les conditions pour être Catégorie III."@fr .

#################################################################
#    Déclaration des Propriétés d'Objet (Relations) - Étape 1
#################################################################

:annonceConcours rdf:type owl:ObjectProperty ;
  rdfs:domain :AppelAPropositions ;
  rdfs:range :Concours ;
  rdfs:label "annonce concours"@fr ;
  rdfs:comment "Lie un appel à propositions au concours qu'il annonce."@fr .

:estAnnoncePar rdf:type owl:ObjectProperty ;
  owl:inverseOf :annonceConcours ;
  rdfs:domain :Concours ;
  rdfs:range :AppelAPropositions ;
  rdfs:label "est annoncé par"@fr .

:concerneDiscipline rdf:type owl:ObjectProperty ;
  rdfs:label "concerne discipline"@fr ;
  rdfs:comment "Lie un concours, un comité ou une proposition à une discipline de recherche."@fr .

:soumetProposition rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :PorteurDeProjet ;
  rdfs:range :PropositionDeProjet ;
  rdfs:label "soumet proposition"@fr .

:estSoumisePar rdf:type owl:ObjectProperty ;
  rdf:type owl:InverseFunctionalProperty ;
  owl:inverseOf :soumetProposition ;
  rdfs:domain :PropositionDeProjet ;
  rdfs:range :PorteurDeProjet ;
  rdfs:label "est soumise par"@fr .

:aPourAuteur rdf:type owl:ObjectProperty ;
  rdfs:domain :PropositionDeProjet ;
  rdfs:range :Chercheur ;
  rdfs:label "a pour auteur"@fr ;
  rdfs:comment "Lie une proposition à un de ses auteurs (chercheurs)."@fr .

:estEvalueeParComite rdf:type owl:ObjectProperty ;
  rdfs:domain :PropositionDeProjet ;
  rdfs:range :ComiteEvaluation ;
  rdfs:label "est évaluée par comité"@fr .

:evalueProposition rdf:type owl:ObjectProperty ;
  owl:inverseOf :estEvalueeParComite ;
  rdfs:domain :ComiteEvaluation ;
  rdfs:range :PropositionDeProjet ;
  rdfs:label "évalue proposition"@fr .

:aPourMembre rdf:type owl:ObjectProperty ;
  rdfs:domain :ComiteEvaluation ;
  rdfs:range :MembreComite ;
  rdfs:label "a pour membre"@fr .

:estMembreDe rdf:type owl:ObjectProperty ;
  owl:inverseOf :aPourMembre ;
  rdfs:domain :MembreComite ;
  rdfs:range :ComiteEvaluation ;
  rdfs:label "est membre de"@fr .

:aPourPresident rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :ComiteEvaluation ;
  rdfs:range :PresidentComite ;
  rdfs:label "a pour président"@fr .

:presideComite rdf:type owl:ObjectProperty ;
  owl:inverseOf :aPourPresident ;
  rdfs:domain :PresidentComite ;
  rdfs:range :ComiteEvaluation ;
  rdfs:label "préside comité"@fr .

:assigneEvaluationA rdf:type owl:ObjectProperty ;
  rdfs:domain :PresidentComite ;
  rdfs:range :MembreComite ;
  rdfs:label "assigne évaluation à"@fr ;
  rdfs:comment "Lie la personne qui assigne (président?) à l'évaluateur assigné pour une proposition donnée (nécessite peut-être une classe d'assignation)."@fr .

:realiseEvaluationIndividuelle rdf:type owl:ObjectProperty ;
  rdfs:domain :MembreComite ;
  rdfs:range :EvaluationIndividuelle ;
  rdfs:label "réalise évaluation individuelle"@fr .

:concerneProposition rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :Evaluation ;
  rdfs:range :PropositionDeProjet ;
  rdfs:label "concerne proposition"@fr .

:genereEvaluationSommaire rdf:type owl:ObjectProperty ;
  rdfs:domain :ComiteEvaluation ;
  rdfs:range :EvaluationSommaire ;
  rdfs:label "génère évaluation sommaire"@fr .

:estBaseSur rdf:type owl:ObjectProperty ;
  rdfs:domain :EvaluationSommaire ;
  rdfs:range :EvaluationIndividuelle ;
  rdfs:label "est basé sur"@fr ;
  rdfs:comment "Lie une évaluation sommaire aux évaluations individuelles qui la composent."@fr .

:emetRecommandation rdf:type owl:ObjectProperty ;
  rdfs:domain :ComiteEvaluation ;
  rdfs:range :RecommandationComite ;
  rdfs:label "émet recommandation"@fr .

:aPourRecommandation rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :PropositionDeProjet ;
  rdfs:range :RecommandationComite ;
  rdfs:label "a pour recommandation"@fr .

:aPourNoteCritere rdf:type owl:ObjectProperty ;
  rdfs:domain :Evaluation ;
  rdfs:range :NoteAttribuee ;
  rdfs:label "a pour note critère"@fr .

:concerneCritere rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :NoteAttribuee ;
  rdfs:range :CritereEvaluation ;
  rdfs:label "concerne critère"@fr .

:aValeurNote rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :NoteAttribuee ;
  rdfs:range :NoteEvaluation ;
  rdfs:label "a valeur note"@fr .

:determineStatut rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :RecommandationComite ;
  rdfs:range :StatutProposition ;
  rdfs:label "détermine statut"@fr .

:aPourStatut rdf:type owl:ObjectProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :PropositionDeProjet ;
  rdfs:range :StatutProposition ;
  rdfs:label "a pour statut"@fr .

#################################################################
#    Déclaration des Propriétés de Données (Attributs) - Étape 1
#################################################################

:dateLimiteSoumission rdf:type owl:DatatypeProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :AppelAPropositions ;
  rdfs:range xsd:date ;
  rdfs:label "date limite de soumission"@fr .

:dateReponseAttendue rdf:type owl:DatatypeProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :AppelAPropositions ;
  rdfs:range xsd:date ;
  rdfs:label "date de réponse attendue"@fr .

:titreProposition rdf:type owl:DatatypeProperty ;
  rdf:type owl:FunctionalProperty ;
  rdfs:domain :PropositionDeProjet ;
  rdfs:range xsd:string ;
  rdfs:label "titre de la proposition"@fr .

:nomChercheur rdf:type owl:DatatypeProperty ;
  rdfs:domain :Chercheur ;
  rdfs:range xsd:string ;
  rdfs:label "nom du chercheur"@fr .

#################################################################
#    Déclaration des Individus (Instances)
#################################################################

:Note_A rdf:type owl:NamedIndividual ,
                 :NoteEvaluation ;
  rdfs:label "Excellent"@fr ;
  rdfs:comment "Correspond à la note littérale A."@fr .

:Note_B rdf:type owl:NamedIndividual ,
                 :NoteEvaluation ;
  rdfs:label "Très Bon"@fr ;
  rdfs:comment "Correspond à la note littérale B."@fr .

:Note_C rdf:type owl:NamedIndividual ,
                 :NoteEvaluation ;
  rdfs:label "Bon"@fr ;
  rdfs:comment "Correspond à la note littérale C."@fr .

:Critere_ExcellenceDossier rdf:type owl:NamedIndividual ,
                                   :CritereEvaluation ;
  rdfs:label "Excellence du ou des dossiers de chercheur"@fr .

:Critere_PotentielProposition rdf:type owl:NamedIndividual ,
                                      :CritereEvaluation ;
  rdfs:label "Potentiel de la proposition en terme de contribution"@fr .

:Critere_FormationPersonnel rdf:type owl:NamedIndividual ,
                                    :CritereEvaluation ;
  rdfs:label "Possibilité de former du personnel hautement qualifié"@fr .

:Critere_RetombeesSocioEco rdf:type owl:NamedIndividual ,
                                   :CritereEvaluation ;
  rdfs:label "Retombées socio-économiques"@fr .

:Recommandation_Financement rdf:type owl:NamedIndividual ,
                                    :RecommandationComite ;
  rdfs:label "Financement"@fr .

:Recommandation_Conditionnel rdf:type owl:NamedIndividual ,
                                     :RecommandationComite ;
  rdfs:label "Financement conditionnel"@fr .

:Recommandation_Rejet rdf:type owl:NamedIndividual ,
                               :RecommandationComite ;
  rdfs:label "Rejet"@fr .

#################################################################
#    Axiomes et Restrictions - Étape 1 (Exemples)
#################################################################

:ComiteEvaluation
  rdfs:subClassOf [ rdf:type owl:Restriction ;
                    owl:onProperty :aPourPresident ;
                    owl:cardinality "1"^^xsd:nonNegativeInteger
                  ] ;
  rdfs:subClassOf [ rdf:type owl:Restriction ;
                    owl:onProperty :aPourMembre ;
                    owl:minCardinality "4"^^xsd:nonNegativeInteger
                  ] ;
  rdfs:subClassOf [ rdf:type owl:Restriction ;
                    owl:onProperty :aPourMembre ;
                    owl:maxCardinality "8"^^xsd:nonNegativeInteger
                  ] ;
  rdfs:subClassOf [ rdf:type owl:Restriction ;
                    owl:onProperty :aPourMembre ;
                    owl:allValuesFrom [ rdf:type owl:Class ;
                                        owl:unionOf ( :ChercheurSenior
                                                      :ChercheurEtabli
                                                    )
                                      ]
                  ] .

#################################################################
#    Axiomes pour Classification Automatique - Étape 2
#################################################################

:PropositionDeCategorieI rdf:type owl:Class ;
  owl:equivalentClass [ rdf:type owl:Class ;
                        owl:intersectionOf ( :PropositionDeProjet
                                             [ rdf:type owl:Restriction ;   # summary evaluation with at least 3 criteria rated A
                                               owl:onProperty :aPourEvaluationSommaire ;
                                               owl:someValuesFrom [ rdf:type owl:Restriction ;
                                                                    owl:onProperty :aPourNoteCritere ;
                                                                    owl:minQualifiedCardinality "3"^^xsd:nonNegativeInteger ;
                                                                    owl:onClass [ rdf:type owl:Restriction ;
                                                                                  owl:onProperty :aValeurNote ;
                                                                                  owl:hasValue :Note_A
                                                                                ]
                                                                  ]
                                             ]
                                           )
                      ] .

:PropositionDeCategorieII rdf:type owl:Class ;
  owl:equivalentClass [ rdf:type owl:Class ;
                        owl:intersectionOf ( :PropositionDeProjet
                                             [ rdf:type owl:Restriction ;   # summary evaluation with >= 2 criteria rated B and >= 1 rated A
                                               owl:onProperty :aPourEvaluationSommaire ;
                                               owl:someValuesFrom [ rdf:type owl:Class ;
                                                                    owl:intersectionOf ( :EvaluationSommaire
                                                                                         [ rdf:type owl:Restriction ;
                                                                                           owl:onProperty :aPourNoteCritere ;
                                                                                           owl:minQualifiedCardinality "2"^^xsd:nonNegativeInteger ;
                                                                                           owl:onClass [ rdf:type owl:Restriction ;
                                                                                                         owl:onProperty :aValeurNote ;
                                                                                                         owl:hasValue :Note_B
                                                                                                       ]
                                                                                         ]
                                                                                         [ rdf:type owl:Restriction ;
                                                                                           owl:onProperty :aPourNoteCritere ;
                                                                                           owl:minQualifiedCardinality "1"^^xsd:nonNegativeInteger ;
                                                                                           owl:onClass [ rdf:type owl:Restriction ;
                                                                                                         owl:onProperty :aValeurNote ;
                                                                                                         owl:hasValue :Note_A
                                                                                                       ]
                                                                                         ]
                                                                                       )
                                                                  ]
                                             ]
                                             [ rdf:type owl:Class ;   # and not already Category I
                                               owl:complementOf :PropositionDeCategorieI
                                             ]
                                           )
                      ] .

:PropositionDeCategorieIII rdf:type owl:Class ;
  owl:equivalentClass [ rdf:type owl:Class ;
                        owl:intersectionOf ( :PropositionDeProjet
                                             [ rdf:type owl:Restriction ;   # summary evaluation with >= 2 criteria rated C
                                               owl:onProperty :aPourEvaluationSommaire ;
                                               owl:someValuesFrom [ rdf:type owl:Class ;
                                                                    owl:intersectionOf ( :EvaluationSommaire
                                                                                         [ rdf:type owl:Restriction ;
                                                                                           owl:onProperty :aPourNoteCritere ;
                                                                                           owl:minQualifiedCardinality "2"^^xsd:nonNegativeInteger ;
                                                                                           owl:onClass [ rdf:type owl:Restriction ;
                                                                                                         owl:onProperty :aValeurNote ;
                                                                                                         owl:hasValue :Note_C
                                                                                                       ]
                                                                                         ]
                                                                                       )
                                                                  ]
                                             ]
                                             [ rdf:type owl:Class ;   # and not Category I
                                               owl:complementOf :PropositionDeCategorieI
                                             ]
                                             [ rdf:type owl:Class ;   # and not Category II
                                               owl:complementOf :PropositionDeCategorieII
                                             ]
                                           )
                      ] .

:PropositionDeCategorieI rdfs:subClassOf [ rdf:type owl:Restriction ;
                                             owl:onProperty :aPourStatut ;
                                             owl:allValuesFrom :Categorie_I
                                           ] .
:PropositionDeCategorieII rdfs:subClassOf [ rdf:type owl:Restriction ;
                                              owl:onProperty :aPourStatut ;
                                              owl:allValuesFrom :Categorie_II
                                            ] .
:PropositionDeCategorieIII rdfs:subClassOf [ rdf:type owl:Restriction ;
                                               owl:onProperty :aPourStatut ;
                                               owl:allValuesFrom :Categorie_III
                                             ] .

# --- Fin de l'ontologie ---