Spaces:
Sleeping
Sleeping
# Central configuration: filesystem locations and processing settings.
# Importing this module creates the output, dataset and log directories.
from pathlib import Path
import os

# Project root: the third ancestor directory of this file (parents[2]).
BASE_DIR = Path(__file__).resolve().parents[2]
OUTPUT_DIR = BASE_DIR / "output"

# Docs paths
METADATA_FILE = BASE_DIR / "dataset" / "docs_metadata.csv"
CHUNKS_FILE = OUTPUT_DIR / "documents" / "chunks.json"
EMBEDDINGS_FILE = OUTPUT_DIR / "documents" / "embeddings.npy"
FAISS_INDEX_FILE = OUTPUT_DIR / "documents" / "faiss_docs.bin"

# Ytb paths
RAW_CSV = OUTPUT_DIR / "youtube" / "parlement_transcript.csv"
PROCESSED_CHUNKS_CSV = OUTPUT_DIR / "youtube" / "chunks.csv"
FAISS_INDEX_FILE_YT = OUTPUT_DIR / "youtube" / "faiss_ytb.bin"
FAISS_METADATA_FILE_YT = OUTPUT_DIR / "youtube" / "faiss_metadata.pkl"

# Ensure parent directories exist
(OUTPUT_DIR / "documents").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "youtube").mkdir(parents=True, exist_ok=True)
(BASE_DIR / "dataset").mkdir(parents=True, exist_ok=True)

# Log directory
LOG_DIR = BASE_DIR / "logs"
os.makedirs(LOG_DIR, exist_ok=True)

# Processing settings
MAX_TOKENS = 512  # Chunk size
BATCH_SIZE = 32  # Embedding batch size
TOP_K = 5  # presumably the number of results returned per search -- TODO confirm

# Embedding model (multilingual for Arabic and French)
YT_EMBEDDING_MODEL = "paraphrase-multilingual-mpnet-base-v2"
EMBEDDING_MODEL = "paraphrase-multilingual-mpnet-base-v2"
LLM_MODEL = "gemini-2.5-flash"  # LLM model identifier

# Max docs and videos to process
MAX_DOCS = 540
MAX_VIDEOS = 5

# Title paths
TITLE_EMBEDDINGS_FILE = OUTPUT_DIR / "title_embd" / "title_embeddings.npy"
TITLE_FAISS_INDEX_FILE = OUTPUT_DIR / "title_embd" / "faiss_titles.bin"

# Create the title directory too, so writes to the title files do not fail
# on a missing parent directory (same treatment as the other output folders).
TITLE_EMBEDDINGS_FILE.parent.mkdir(parents=True, exist_ok=True)