from pathlib import Path from dataclasses import dataclass from typing import List import re import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity @dataclass class RetrievedChunk: text: str filename: str score: float class RAGEngineGPT2: def __init__( self, corpus_dir: str = "corpus/", chunk_size: int = 450, chunk_overlap: int = 80, min_score: float = 0.05 ): self.corpus_dir = Path(corpus_dir) self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.min_score = min_score self.chunks: List[str] = [] self.chunk_sources: List[str] = [] self.vectorizer = TfidfVectorizer( lowercase=True, strip_accents="unicode", ngram_range=(1, 2), max_features=10000 ) self._load_and_chunk_corpus() self._index() def _clean_text(self, text: str) -> str: text = re.sub(r"\s+", " ", text) return text.strip() def _clean_for_prompt(self, text: str) -> str: text = re.sub(r"\[[A-ZÉÈÊÀÙÎÏÇ _-]+\]", "", text) text = re.sub(r"\s+", " ", text) return text.strip() def _split_into_chunks(self, text: str) -> List[str]: text = self._clean_text(text) if len(text) <= self.chunk_size: return [text] chunks = [] start = 0 while start < len(text): end = start + self.chunk_size chunk = text[start:end].strip() if chunk: chunks.append(chunk) start += self.chunk_size - self.chunk_overlap return chunks def _load_and_chunk_corpus(self): if not self.corpus_dir.exists(): raise FileNotFoundError(f"Dossier corpus introuvable : {self.corpus_dir}") files = sorted( list(self.corpus_dir.glob("*.txt")) + list(self.corpus_dir.glob("*.md")) ) if not files: raise ValueError("Aucun fichier .txt ou .md trouvé dans le dossier corpus.") for file_path in files: try: text = file_path.read_text(encoding="utf-8") chunks = self._split_into_chunks(text) for chunk in chunks: self.chunks.append(chunk) self.chunk_sources.append(file_path.name) except Exception as e: print(f"Fichier ignoré : {file_path.name} | Erreur : {e}") if not self.chunks: raise ValueError("Aucun passage exploitable trouvé dans le corpus.") print(f"{len(files)} fichiers chargés.") print(f"{len(self.chunks)} passages indexés.") def _index(self): self.tfidf_matrix = self.vectorizer.fit_transform(self.chunks) def _apply_domain_boost(self, query: str, scores): query_lower = query.lower() for i in range(len(scores)): filename = self.chunk_sources[i].lower() if ("médecine" in query_lower or "medecine" in query_lower or "santé" in query_lower) and "ia_medecine" in filename: scores[i] += 0.08 if ("finance" in query_lower or "banque" in query_lower or "crédit" in query_lower or "credit" in query_lower) and "ia_finance" in filename: scores[i] += 0.08 if ("recrutement" in query_lower or "rh" in query_lower) and "ia_rh" in filename: scores[i] += 0.08 if ("éducation" in query_lower or "education" in query_lower or "école" in query_lower or "ecole" in query_lower or "élève" in query_lower or "eleve" in query_lower) and "ia_education" in filename: scores[i] += 0.08 if ("cybersécurité" in query_lower or "cybersecurite" in query_lower or "cyber" in query_lower) and "ia_cybersecurite" in filename: scores[i] += 0.08 if ("art" in query_lower or "image" in query_lower or "création" in query_lower or "creation" in query_lower) and "ia_art" in filename: scores[i] += 0.08 return scores def search(self, query: str, top_k: int = 1) -> List[RetrievedChunk]: query = self._clean_text(query) if not query: return [] q_vec = self.vectorizer.transform([query]) scores = cosine_similarity(q_vec, self.tfidf_matrix)[0] scores = self._apply_domain_boost(query, scores) top_idx = np.argsort(scores)[::-1][:top_k] results = [] for i in top_idx: score = float(scores[i]) if score >= self.min_score: results.append( RetrievedChunk( text=self.chunks[i], filename=self.chunk_sources[i], score=score ) ) return results def build_prompt(self, query: str, top_k: int = 1) -> str: results = self.search(query, top_k=top_k) if not results: return "" context = "\n\n".join( [self._clean_for_prompt(result.text) for result in results] ) return f""" Contexte : {context} Question : {query} Réponse en français : """