Coding Showcase — Portfolio de Codes

Coding Showcase

Mostly Python, R, Bash — Use at your own risk!

(code propre) RI_tp1_6avril25.ipynb

Configure un environnement complet pour la recherche d'information : installation Pyserini/Lucene, indexation, requêtes BM25 / TF-IDF / RM3, évaluation TREC.

Mots-clés : Recherche d'information · Pyserini · Indexation · BM25 · TREC

# === Step 0: Full environment setup ===
# Installs Java 21 + build tools, then Pyserini/NLTK/pytrec_eval, downloads
# NLTK resources, creates the project directory layout, and prepares the
# text-preprocessing objects used by `preprocess_text` below.
import os, sys, subprocess, time, glob, re, json, nltk
from tqdm.notebook import tqdm
import traceback

print("--- Début de la Configuration Complète ---")

# [1/8] Java 21 — required by Pyserini's Lucene backend.
subprocess.run("apt-get update -qq && apt-get install -y openjdk-21-jdk-headless -qq",
               shell=True, check=True, timeout=180)

# [2/8] Point the system `java` at the new JDK via update-alternatives,
# and export JAVA_HOME so child processes (Lucene/Pyserini) find it.
java_path = "/usr/lib/jvm/java-21-openjdk-amd64/bin/java"
if os.path.exists(java_path):
    subprocess.run(f"update-alternatives --install /usr/bin/java java {java_path} 1", shell=True)
    subprocess.run(f"update-alternatives --set java {java_path}", shell=True)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"

# [3/8] Build tools — needed to compile native wheels during pip install.
subprocess.run("apt-get install -y build-essential cmake -qq", shell=True, check=True, timeout=180)

# [4/8] pybind11 — presumably a build-time dependency of pytrec_eval; confirm.
subprocess.run(f"{sys.executable} -m pip install pybind11 -q", shell=True, check=True, timeout=60)

# [5/8] Main Python dependencies: Pyserini + NLTK + pytrec_eval.
subprocess.run(f"{sys.executable} -m pip install pyserini nltk pytrec_eval",
               shell=True, check=True, timeout=600)

# [6/8] NLTK resources used by tokenization/lemmatization below.
for res in ["wordnet","stopwords","punkt","omw-1.4","punkt_tab"]:
    nltk.download(res, quiet=True)

# [7/8] Project paths — Google Colab layout: Drive holds the TREC input,
# local /content holds corpus, indexes, runs, and evaluation output.
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Projet_RI/TREC"
OUTPUT_DIR         = "/content/ap_output"
CORPUS_DIR         = os.path.join(OUTPUT_DIR, "corpus")
INDEX_DIR_BASELINE = os.path.join(OUTPUT_DIR, "indexes/baseline")
INDEX_DIR_PREPROC  = os.path.join(OUTPUT_DIR, "indexes/preprocessed")
RUN_DIR            = os.path.join(OUTPUT_DIR, "runs")
EVAL_DIR           = os.path.join(OUTPUT_DIR, "eval")
for d in [OUTPUT_DIR,CORPUS_DIR,INDEX_DIR_BASELINE,INDEX_DIR_PREPROC,RUN_DIR,EVAL_DIR]:
    os.makedirs(d, exist_ok=True)

# [8/8] Preprocessing objects: English stopword set + WordNet lemmatizer.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
stop_words_set = set(stopwords.words("english"))
lemmatizer     = WordNetLemmatizer()

def preprocess_text(text):
    """Lowercase, tokenize, and lemmatize *text*.

    Non-alphabetic tokens and English stopwords are dropped; anything that
    is not a string yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in stop_words_set:
            kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

print("--- Configuration Complète Terminée ---")

1mars2018.R

Chargement, nettoyage et modélisation de données Census. Arbre de décision (rpart) et réseau neuronal (nnet) pour prédire les groupes de revenus.

Mots-clés : R · Arbre de décision · Réseau neuronal · Census · Prédiction

## Author: Dominique Loyer
## Questions 1-2: Exploration of the Census data

# Load the cleaned CSV export and inspect its summary statistics/structure.
Census <- read.csv("~/Desktop/CensusClean.csv")
summary(Census)
str(Census)

# Same data read from the Excel workbook (requires the readxl package).
pwc <- readxl::read_excel("Census.xlsx")

# Number of Masters-educated rows in the ">50K" income group.
master        <- subset(pwc, Education %in% "Masters")
master50kplus <- subset(master, `Income Group` %in% ">50K")
dim(master50kplus)

# Number of Doctorate-educated rows in the ">50K" income group.
doctorate        <- subset(pwc, Education %in% "Doctorate")
doctorate50kplus <- subset(doctorate, `Income Group` %in% ">50K")
dim(doctorate50kplus)

## Question 3: Cleaning and normalization
# NOTE(review): `pwcClean` is never created in this excerpt — presumably a
# cleaned copy of `pwc` built elsewhere; confirm before running.
# Convert categorical columns to factors for the models below.
pwcClean$workclass      <- as.factor(pwcClean$workclass)
pwcClean$`Income Group` <- as.factor(pwcClean$`Income Group`)
pwcClean$Education      <- as.factor(pwcClean$Education)
pwcClean$Gender         <- as.factor(pwcClean$Gender)

# Min-max scaling to [0, 1].
# NOTE(review): yields NaN when a column is constant (max == min) and
# propagates NA values — verify the input columns upstream.
normalize <- function(x) (x - min(x)) / (max(x) - min(x))
pwcClean$Age                      <- normalize(pwcClean$Age)
pwcClean$`Demographic Adjustment` <- normalize(pwcClean$`Demographic Adjustment`)
pwcClean$`capital-gain`           <- normalize(pwcClean$`capital-gain`)
pwcClean$`hours-per-week`         <- normalize(pwcClean$`hours-per-week`)

## Decision tree (rpart)
library(rpart); library(rpart.plot)
# Classification tree predicting income group from five predictors.
# NOTE(review): minsplit=1 allows splits down to single observations,
# which maximally overfits — kept as in the original exercise.
mytreeadult <- rpart(`Income Group` ~ Education + workclass + Gender +
                     Age + `hours-per-week`,
                     data=pwcClean, method="class",
                     control=rpart.control(minsplit=1))
rpart.plot(mytreeadult, type=3, extra=101, fallen.leaves=TRUE)

## Train / validation partition
# NOTE(review): despite its name, `pwc.test` (20,000 sampled rows) is used
# as the TRAINING set for nnet below; `pwc.valid` holds the remaining rows.
index     <- sample(1:nrow(pwcClean), 20000)
pwc.test  <- pwcClean[ index,]
pwc.valid <- pwcClean[-index,]

## Neural network, nnet (10 hidden units)
library(nnet)
set.seed(99999999)
pwc.net <- nnet(`Income Group` ~ ., data=pwc.test, size=10)
# Confusion matrix: actual vs predicted income group on the hold-out set.
pwc.valid$est.income <- predict(pwc.net, pwc.valid, type="class")
table(pwc.valid$`Income Group`, pwc.valid$est.income)

ARIMA.R

Série chronologique de ventes : décomposition saisonnière, modèle ARIMA automatique et prévisions à 20 périodes.

Mots-clés : R · ARIMA · Séries temporelles · Prévision · forecast

library(forecast)

# Monthly sales series starting in 1995 (first column of the CSV).
sales <- read.csv("sales.csv")
sales <- ts(sales[,1], start=1995, freq=12)

# Seasonal decomposition (trend / seasonal / remainder) and plot.
apts <- decompose(sales)
plot(apts)

# Automatic ARIMA model selection via forecast::auto.arima,
# with in-sample accuracy metrics.
arima_model <- auto.arima(sales)
arima_model
accuracy(arima_model)

# Forecast the next 20 periods and plot with prediction intervals.
plot(forecast(arima_model, 20))

neural_mt_en_ru.py

Traduction automatique neuronale Anglais → Russe avec Helsinki-NLP/opus-mt. Évaluation BLEU via sacrebleu.

Mots-clés : NMT · Helsinki-NLP · MarianMT · BLEU · HuggingFace

from transformers import MarianMTModel, MarianTokenizer
import sacrebleu

# Pretrained English→Russian MarianMT checkpoint from the Helsinki-NLP
# OPUS-MT collection; tokenizer and model are loaded at import time.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-ru"
tokenizer  = MarianTokenizer.from_pretrained(MODEL_NAME)
model      = MarianMTModel.from_pretrained(MODEL_NAME)

def translate(texts: list) -> list:
    """Translate a batch of source sentences with the Marian model.

    Inputs are tokenized with padding/truncation (max 512 tokens) and the
    generated token ids are decoded back to plain strings.
    """
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    generated_ids = model.generate(**encoded)
    decoded = []
    for ids in generated_ids:
        decoded.append(tokenizer.decode(ids, skip_special_tokens=True))
    return decoded

# Example: translate two English sentences and print the EN/RU pairs.
src = ["The quick brown fox jumps over the lazy dog.",
       "Artificial intelligence is transforming research."]
tgt = translate(src)
for s, t in zip(src, tgt):
    print(f"EN: {s}")
    # Bug fix: the original f-string contained a raw line break inside the
    # quotes (a SyntaxError for single-quoted strings); use an escaped \n.
    print(f"RU: {t}\n")

# BLEU score for the first sentence against one transliterated reference.
# NOTE(review): the leading '&' in the reference looks like a mojibake
# artifact of the transliteration — confirm the intended reference text.
refs  = [["&Bystryj korichnevyj lis prygaet cherez lenivoyu sobaku."]]
hyps  = [tgt[0]]
bleu  = sacrebleu.corpus_bleu(hyps, refs)
print(f"BLEU score : {bleu.score:.2f}")

sysCRED — systemFactChecking (extrait)

Système neuro-symbolique d'évaluation de la crédibilité de l'information. Règles de prédicats ontologiques + module ML.

Mots-clés : Neuro-symbolic AI · Fact-checking · Credibility · OWL · Python

"""
sysCRED - systemFactChecking v2.2.1
DOI : 10.5281/zenodo.18436691
"""
from owlready2 import get_ontology, sync_reasoner_pellet
import numpy as np

ONTO_PATH = "syscred_ontology.owl"
onto = get_ontology(ONTO_PATH).load()

def evaluate_source_credibility(source_uri: str) -> dict:
    """Evaluate the credibility of a source via OWL predicate rules.

    Runs the Pellet reasoner to materialize inferred property values, looks
    the source up by IRI, averages its four score properties, and labels it.

    Args:
        source_uri: Full IRI of the source individual in the ontology.

    Returns:
        A dict with the per-dimension scores, the mean credibility (rounded
        to 3 decimals) and a CREDIBLE/UNCERTAIN label, or an error dict when
        the IRI is not found in the ontology.
    """
    with onto:
        # Make inferred property values available before querying.
        sync_reasoner_pellet(infer_property_values=True)

    source = onto.search_one(iri=source_uri)
    if source is None:
        return {"error": "Source non trouvée dans l'ontologie"}

    def _first_score(prop_name):
        # owlready2 properties are list-valued; the attribute may be missing
        # OR present-but-empty. The original `getattr(..., [0])[0]` raised
        # IndexError on an existing empty list — default to 0 in both cases.
        values = getattr(source, prop_name, None)
        return values[0] if values else 0

    scores = {
        "authority"    : _first_score("hasAuthorityScore"),
        "accuracy"     : _first_score("hasAccuracyScore"),
        "transparency" : _first_score("hasTransparency"),
        "bias"         : _first_score("hasBiasScore"),
    }
    credibility = float(np.mean(list(scores.values())))
    return {
        "source"      : source_uri,
        "scores"      : scores,
        "credibility" : round(credibility, 3),
        "label"       : "CREDIBLE" if credibility >= .65 else "UNCERTAIN"
    }

if __name__ == "__main__":
    result = evaluate_source_credibility("http://syscred.uqam.ca/sources#Reuters")
    print(result)