(code propre) RI_tp1_6avril25.ipynb
Configure un environnement complet pour la recherche d'information : installation Pyserini/Lucene, indexation, requêtes BM25 / TF-IDF / RM3, évaluation TREC.
Mots-clés : Recherche d'information · Pyserini · Indexation · BM25 · TREC
# === Étape 0 : Configuration Complète ===
import os, sys, subprocess, time, glob, re, json, nltk
from tqdm.notebook import tqdm
import traceback
print("--- Début de la Configuration Complète ---")

# [1/8] Java 21 — Pyserini drives Lucene through the JVM, so a JDK is required.
subprocess.run("apt-get update -qq && apt-get install -y openjdk-21-jdk-headless -qq",
               shell=True, check=True, timeout=180)

# [2/8] Point the system `java` at the freshly installed JDK.
# FIX: `check=True` added to both calls — every other subprocess.run in this
# setup validates its exit status; here a failed switch would previously be
# swallowed, leaving a stale JVM on the PATH and causing confusing errors later.
java_path = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path):
    subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path} 1",
                   shell=True, check=True)
    subprocess.run(f"update-alternatives --set java {java_path}",
                   shell=True, check=True)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"

# [3/8] Build toolchain (compiler + cmake) for packages shipped as source.
subprocess.run("apt-get install -y build-essential cmake -qq", shell=True, check=True, timeout=180)

# [4/8] pybind11 — installed first, presumably needed to build pytrec_eval
# from source in the next step (TODO confirm it is still required).
subprocess.run(f"{sys.executable} -m pip install pybind11 -q", shell=True, check=True, timeout=60)

# [5/8] Core IR stack: Pyserini (indexing/retrieval), NLTK (preprocessing),
# pytrec_eval (TREC-style evaluation).
subprocess.run(f"{sys.executable} -m pip install pyserini nltk pytrec_eval",
               shell=True, check=True, timeout=600)

# [6/8] NLTK resources used by the preprocessing pipeline below
# (lemmatizer data, stopword list, tokenizer models).
for res in ["wordnet", "stopwords", "punkt", "omw-1.4", "punkt_tab"]:
    nltk.download(res, quiet=True)
# [7/8] Working directories — all derived outputs live under OUTPUT_DIR;
# DRIVE_PROJECT_PATH points at the source TREC data on Google Drive.
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC"
OUTPUT_DIR = "/content/ap_output"
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
RUN_DIR = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR = os.path.join(OUTPUT_DIR, "eval")

# Create every directory up front; exist_ok makes re-running the cell a no-op.
for _dir in (OUTPUT_DIR, CORPUS_DIR, INDEX_DIR_BASELINE,
             INDEX_DIR_PREPROC, RUN_DIR, EVAL_DIR):
    os.makedirs(_dir, exist_ok=True)
# [8/8] Preprocessing — shared NLTK resources for preprocess_text below.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# English stopword list as a set for O(1) membership tests in the hot loop.
stop_words_set = set(stopwords.words("english"))
# Single lemmatizer instance reused across all documents/queries.
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize *text* for indexing: lowercase, tokenize, keep alphabetic
    non-stopword tokens, lemmatize each, and rejoin with single spaces.

    Non-string input (e.g. None/NaN from a data frame) yields "".
    """
    if not isinstance(text, str):
        return ""
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation/numbers and common function words before lemmatizing.
        if token.isalpha() and token not in stop_words_set:
            kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

print("--- Configuration Complète Terminée ---")