"""Project-wide path constants and processing settings.

Importing this module has side effects: it creates the output, dataset
and log directories referenced below if they do not already exist.
"""

from pathlib import Path
import os

# Project root: two levels above the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parents[2]
OUTPUT_DIR = BASE_DIR / "output"

# Docs paths
METADATA_FILE = BASE_DIR / "dataset" / "docs_metadata.csv"
CHUNKS_FILE = OUTPUT_DIR / "documents" / "chunks.json"
EMBEDDINGS_FILE = OUTPUT_DIR / "documents" / "embeddings.npy"
FAISS_INDEX_FILE = OUTPUT_DIR / "documents" / "faiss_docs.bin"

# YouTube paths
RAW_CSV = OUTPUT_DIR / "youtube" / "parlement_transcript.csv"
PROCESSED_CHUNKS_CSV = OUTPUT_DIR / "youtube" / "chunks.csv"
FAISS_INDEX_FILE_YT = OUTPUT_DIR / "youtube" / "faiss_ytb.bin"
FAISS_METADATA_FILE_YT = OUTPUT_DIR / "youtube" / "faiss_metadata.pkl"

# Title paths
TITLE_EMBEDDINGS_FILE = OUTPUT_DIR / "title_embd" / "title_embeddings.npy"
TITLE_FAISS_INDEX_FILE = OUTPUT_DIR / "title_embd" / "faiss_titles.bin"

# Ensure parent directories exist for every file path declared above.
# "title_embd" is included so that writing the title artefacts does not fail
# on a missing directory, consistent with the documents/youtube outputs.
(OUTPUT_DIR / "documents").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "youtube").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "title_embd").mkdir(parents=True, exist_ok=True)
(BASE_DIR / "dataset").mkdir(parents=True, exist_ok=True)

# Log directory
LOG_DIR = BASE_DIR / "logs"
os.makedirs(LOG_DIR, exist_ok=True)

# Processing settings
MAX_TOKENS = 512  # Chunk size
BATCH_SIZE = 32  # Embedding batch size
TOP_K = 5  # presumably the number of results returned per search - confirm

# Embedding model (multilingual for Arabic and French)
YT_EMBEDDING_MODEL = "paraphrase-multilingual-mpnet-base-v2"
EMBEDDING_MODEL = "paraphrase-multilingual-mpnet-base-v2"

# NOTE(review): the original comment read "Lightweight LLM for CPU", but the
# name looks like a hosted Gemini model rather than a local one - confirm.
LLM_MODEL = "gemini-2.5-flash"

# Max docs and videos to process
MAX_DOCS = 540
MAX_VIDEOS = 5