File size: 1,499 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16c7847
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from pathlib import Path
import os

# Anchor directory: two levels above the folder that contains this file
# (presumably the project root -- confirm against the repository layout).
BASE_DIR = Path(__file__).resolve().parents[2]
OUTPUT_DIR = BASE_DIR / "output"

# Sub-folders of the output tree, kept private to this module
_DOCUMENTS_DIR = OUTPUT_DIR / "documents"
_YOUTUBE_DIR = OUTPUT_DIR / "youtube"

# Document pipeline: input metadata lives under dataset/, artifacts under output/documents/
METADATA_FILE = BASE_DIR / "dataset" / "docs_metadata.csv"

CHUNKS_FILE = _DOCUMENTS_DIR / "chunks.json"
EMBEDDINGS_FILE = _DOCUMENTS_DIR / "embeddings.npy"
FAISS_INDEX_FILE = _DOCUMENTS_DIR / "faiss_docs.bin"

# YouTube pipeline: all artifacts live under output/youtube/
RAW_CSV = _YOUTUBE_DIR / "parlement_transcript.csv"
PROCESSED_CHUNKS_CSV = _YOUTUBE_DIR / "chunks.csv"
FAISS_INDEX_FILE_YT = _YOUTUBE_DIR / "faiss_ytb.bin"
FAISS_METADATA_FILE_YT = _YOUTUBE_DIR / "faiss_metadata.pkl"


# Create the data directories at import time (no error if they already exist)
for _required_dir in (
    OUTPUT_DIR / "documents",
    OUTPUT_DIR / "youtube",
    BASE_DIR / "dataset",
):
    _required_dir.mkdir(parents=True, exist_ok=True)


# Log directory, likewise created at import time
LOG_DIR = BASE_DIR / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Processing settings
MAX_TOKENS: int = 512  # chunk size
BATCH_SIZE: int = 32  # embedding batch size
TOP_K: int = 5  # presumably the number of results retrieved per query -- confirm

# Embedding models (multilingual; noted by the author as chosen for Arabic and French).
# Documents and YouTube transcripts currently use the same model name.
EMBEDDING_MODEL: str = "paraphrase-multilingual-mpnet-base-v2"
YT_EMBEDDING_MODEL: str = "paraphrase-multilingual-mpnet-base-v2"

# LLM model identifier.
# NOTE(review): an earlier comment called this a "lightweight LLM for CPU", but
# the name looks like a hosted Gemini model -- confirm how it is actually served.
LLM_MODEL: str = "gemini-2.5-flash"

# Upper bounds on how many documents and videos are processed
MAX_DOCS: int = 540
MAX_VIDEOS: int = 5

# Title paths: title embeddings and their FAISS index live under output/title_embd/
TITLE_EMBEDDINGS_FILE = OUTPUT_DIR / "title_embd" / "title_embeddings.npy"
TITLE_FAISS_INDEX_FILE = OUTPUT_DIR / "title_embd" / "faiss_titles.bin"

# The other output folders (documents, youtube, dataset, logs) are created at
# import time earlier in this file; do the same for the title folder so the
# first write to either title file does not fail on a missing directory.
TITLE_EMBEDDINGS_FILE.parent.mkdir(parents=True, exist_ok=True)