| import os |
| import faiss |
| import pickle |
| import numpy as np |
| import openai |
| import tiktoken |
| from dotenv import load_dotenv |
| from openai import OpenAI |
| from pathlib import Path |
| from huggingface_hub import hf_hub_download |
|
|
| |
# Load the local .env file FIRST so that every environment lookup below
# (and the OpenAI client) can see values defined there. Previously CACHE_DIR
# was read before this call, so a CACHE_DIR entry in .env was silently ignored.
load_dotenv()

# Directory passed as ``cache_dir`` to the Hugging Face Hub downloads.
CACHE_DIR = os.getenv("CACHE_DIR", "/tmp/cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# Keep any value already present in the environment, otherwise fall back to
# locations under /tmp.
# NOTE(review): huggingface_hub is imported before these variables are set, so
# it may already have resolved its own defaults at import time — confirm.
os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/huggingface/cache")
os.environ.setdefault("HF_HOME", "/tmp/huggingface")

# Shared OpenAI client, built with no explicit arguments.
# Presumably it picks up its credentials from the environment (hence the
# load_dotenv() call above) — verify against the deployment configuration.
client = OpenAI()
# Model identifiers: one for embedding the question, one for the chat answer.
EMBED_MODEL: str = "text-embedding-3-large"
CHAT_MODEL: str = "o4-mini-2025-04-16"

# File names of the retrieval artifacts (FAISS index, id list, chunk store).
FAISS_INDEX_FILE: str = "tindle_index.faiss"
IDS_PKL: str = "tindle_ids.pkl"
CHUNKS_PKL: str = "tindle_chunks.pkl"

# Default number of neighbours to retrieve and token budget for the context.
TOP_K: int = 10
MAX_TOKENS_CONTEXT: int = 4000

# System message sent with every chat request; the sentences are joined with
# a single space.
SYSTEM_PROMPT: str = " ".join(
    (
        "Tu es un assistant expert en droit fiscal.",
        "Fais d'abord appel aux passages fournis pour répondre.",
        "Si ces passages sont insuffisants, utilise tes connaissances générales en le précisant clairement.",
    )
)
|
|
| |
|
|
| |
# Fetch the three retrieval artifacts from the same Hub dataset repository,
# in order: FAISS index, id list, chunk store. Each call returns a local path.
index_path, ids_path, chunks_path = (
    hf_hub_download(
        repo_id="Jordanche/fiscarag",
        filename=artifact_name,
        repo_type="dataset",
        cache_dir=CACHE_DIR,
    )
    for artifact_name in (FAISS_INDEX_FILE, IDS_PKL, CHUNKS_PKL)
)
|
|
| |
# Load the vector index from the downloaded file.
index = faiss.read_index(index_path)

# NOTE(review): both files below are downloaded artifacts deserialized with
# pickle, which can execute arbitrary code — only safe if the repository
# contents are trusted.

# ``ids`` is indexed by the positions returned from the FAISS search.
with open(ids_path, "rb") as ids_file:
    ids = pickle.load(ids_file)

# ``chunks_dict`` is keyed by the values stored in ``ids``.
with open(chunks_path, "rb") as chunks_file:
    chunks_dict = pickle.load(chunks_file)
|
|
| |
# Tokenizer used to estimate how much of the context budget a passage uses.
# NOTE(review): the chat model may use a different encoding than cl100k_base,
# in which case these counts are only approximate — confirm.
enc = tiktoken.get_encoding("cl100k_base")


def num_tokens(s: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *s*.

    Special-token markers that appear literally in *s* (for example
    ``<|endoftext|>``) are counted as ordinary text. tiktoken's default is to
    raise ``ValueError`` on such input, which would abort context building
    for a document that merely quotes one of those markers.
    """
    return len(enc.encode(s, disallowed_special=()))
|
|
|
|
| |
|
|
def embed_question(question: str) -> list[float]:
    """Return the embedding of *question* computed with ``EMBED_MODEL``.

    The question is sent as a one-element batch and the embedding of the
    first (only) returned item is handed back.
    """
    request_args = {"model": EMBED_MODEL, "input": [question]}
    response = client.embeddings.create(**request_args)
    first_item = response.data[0]
    return first_item.embedding
|
|
|
|
def retrieve_chunks(q_emb: list[float], k: int = TOP_K):
    """Return the chunks closest to the query embedding.

    Args:
        q_emb: Embedding vector of the question.
        k: Number of neighbours requested from the FAISS index.

    Returns:
        A list of dicts, in the order returned by the index, each with the
        keys ``score`` (value reported by FAISS as a float), ``id`` (chunk
        id), ``text`` (chunk text) and ``metadata`` (every other field of
        the chunk record). The list may hold fewer than ``k`` items.
    """
    # FAISS expects a 2-D float32 array: one row per query.
    xq = np.array([q_emb], dtype="float32")
    distances, indices = index.search(xq, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when it finds fewer than k
        # neighbours; without this guard ids[-1] would silently return the
        # last chunk as a bogus hit.
        if idx < 0:
            continue
        cid = ids[idx]
        meta = chunks_dict[cid]
        results.append({
            "score": float(dist),
            "id": cid,
            "text": meta["text"],
            "metadata": {cle: val for cle, val in meta.items() if cle != "text"},
        })
    return results
|
|
|
|
def build_context(chunks, max_tokens=MAX_TOKENS_CONTEXT):
    """Assemble the context string from retrieved chunks within a token budget.

    Chunks are processed by ascending ``score``. Each one is rendered as
    ``(Source: <id> | key: value | ...) <text>``; rendering stops at the first
    chunk that would push the running token total above *max_tokens*. The
    kept entries are joined with blank lines.

    NOTE(review): ascending order puts the best match first only if ``score``
    is a distance (smaller = closer) — confirm against the index metric.
    """
    selected = []
    used = 0
    ranked = sorted(chunks, key=lambda item: item["score"])

    for chunk in ranked:
        # Render every metadata field as "key: value".
        fields = [f"{name}: {val}" for name, val in chunk["metadata"].items()]

        header = f"(Source: {chunk['id']}"
        if fields:
            header += " | " + " | ".join(fields)
        header += ")"

        entry = f"{header} {chunk['text']}"
        cost = num_tokens(entry)

        # Stop at the first entry that does not fit; later ones are dropped too.
        if used + cost > max_tokens:
            break

        selected.append(entry)
        used += cost

    return "\n\n".join(selected)
|
|
|
|
def make_prompt(question: str, context: str):
    """Build the chat message list: the system prompt, then one user message.

    The user message carries the question followed by the context passages.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": f"Question: {question}\n\nContexte:\n{context}",
    }
    return [system_message, user_message]
|
|
|
|
def answer_question(question: str, k: int = TOP_K) -> str:
    """Answer *question* using retrieved passages as context.

    Steps: embed the question, retrieve *k* chunks, build the context,
    assemble the chat messages and query ``CHAT_MODEL``. The content of the
    first returned choice is handed back unchanged.
    """
    # Retrieval: question -> embedding -> nearest chunks.
    retrieved = retrieve_chunks(embed_question(question), k)

    # Prompt: budgeted context plus the question.
    messages = make_prompt(question, build_context(retrieved))

    # Generation.
    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
| |
if __name__ == "__main__":
    # Manual smoke test: run one sample question through the pipeline.
    sample_question = "Quels sont les délais pour la réhabilitation d'hôtels en outre-mer ?"
    answer = answer_question(sample_question, k=10)
    print(answer)
|
|