# RAG_APP/src/configs/config.py
# Repository page header (not part of the program):
#   author: ELHACHYMI · commit message: "add title path" · commit 16c7847 (verified)
from pathlib import Path
import os
# Central configuration: filesystem locations and processing constants.
# Importing this module has a side effect: every directory that holds one of
# the artifact paths below is created if it does not already exist.

# Root directory: two levels above the directory containing this file.
BASE_DIR = Path(__file__).resolve().parents[2]
OUTPUT_DIR = BASE_DIR / "output"

# Docs paths
METADATA_FILE = BASE_DIR / "dataset" / "docs_metadata.csv"
CHUNKS_FILE = OUTPUT_DIR / "documents" / "chunks.json"
EMBEDDINGS_FILE = OUTPUT_DIR / "documents" / "embeddings.npy"
FAISS_INDEX_FILE = OUTPUT_DIR / "documents" / "faiss_docs.bin"

# Ytb paths
RAW_CSV = OUTPUT_DIR / "youtube" / "parlement_transcript.csv"
PROCESSED_CHUNKS_CSV = OUTPUT_DIR / "youtube" / "chunks.csv"
FAISS_INDEX_FILE_YT = OUTPUT_DIR / "youtube" / "faiss_ytb.bin"
FAISS_METADATA_FILE_YT = OUTPUT_DIR / "youtube" / "faiss_metadata.pkl"

# Title paths
TITLE_EMBEDDINGS_FILE = OUTPUT_DIR / "title_embd" / "title_embeddings.npy"
TITLE_FAISS_INDEX_FILE = OUTPUT_DIR / "title_embd" / "faiss_titles.bin"

# Log directory
LOG_DIR = BASE_DIR / "logs"

# Ensure parent directories exist. The title directory is included so that
# writing the title artifacts does not depend on another module creating it
# first; each directory is derived from a path constant above so the two
# cannot drift apart.
for _directory in (
    CHUNKS_FILE.parent,            # output/documents
    RAW_CSV.parent,                # output/youtube
    TITLE_EMBEDDINGS_FILE.parent,  # output/title_embd
    METADATA_FILE.parent,          # dataset
    LOG_DIR,                       # logs
):
    _directory.mkdir(parents=True, exist_ok=True)
del _directory  # keep the loop variable out of the module namespace

# Processing settings
MAX_TOKENS = 512  # Chunk size
BATCH_SIZE = 32  # Embedding batch size
TOP_K = 5

# Embedding model (multilingual for Arabic and French)
YT_EMBEDDING_MODEL = "paraphrase-multilingual-mpnet-base-v2"
EMBEDDING_MODEL = "paraphrase-multilingual-mpnet-base-v2"

# LLM model identifier.
# NOTE(review): the previous comment described this as a "lightweight LLM for
# CPU", which looks stale for a Gemini model name — confirm and update.
LLM_MODEL = "gemini-2.5-flash"

# Max docs and videos to process
MAX_DOCS = 540
MAX_VIDEOS = 5