import csv
import json
import re
import unicodedata

import numpy as np
import pandas as pd

# Matches Arabic diacritics (tashkil): harakat and related combining marks.
ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
# Kashida/tatweel elongation character — purely cosmetic in Arabic text.
TATWEEL = '\u0640'


def clean_text(text, language="ar"):
    """Normalize and clean text for downstream processing.

    For Arabic ("ar"): strips diacritics and tatweel, normalizes alef,
    alef-maqsura and hamza-carrier variants, removes punctuation, and
    collapses whitespace. For any other language (treated as French):
    only collapses whitespace.

    Args:
        text: Input string.
        language: "ar" for Arabic-specific cleaning; anything else gets
            whitespace normalization only.

    Returns:
        The cleaned, stripped string.
    """
    text = unicodedata.normalize("NFC", text)
    if language == "ar":
        text = ARABIC_DIACRITICS.sub('', text)
        text = text.replace(TATWEEL, '')
        # Normalize alef variants (hamza above/below, madda) to bare alef.
        text = re.sub(r'[إأآا]', 'ا', text)
        # Normalize alef maqsura to ya.
        text = text.replace('ى', 'ي')
        # Normalize hamza carriers to their base letters.
        text = text.replace('ؤ', 'و').replace('ئ', 'ي')
        # Drop everything that is neither a word character nor whitespace.
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
    else:  # French
        text = re.sub(r'\s+', ' ', text)
    return text.strip()


def chunk_text(text, max_tokens):
    """Split text into chunks of at most ``max_tokens`` words.

    NOTE: "tokens" here are whitespace-delimited words, not model
    tokenizer tokens.

    Args:
        text: Input string.
        max_tokens: Maximum number of words per chunk; must be >= 1.

    Returns:
        List of chunk strings; an empty list for empty/whitespace-only input.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be >= 1")
    words = text.split()
    return [
        ' '.join(words[i:i + max_tokens])
        for i in range(0, len(words), max_tokens)
    ]


def save_chunks_to_disk(chunks_data, output_file):
    """Save chunks to a JSON file (UTF-8)."""
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Arabic letters readable in the output
        # instead of \uXXXX escape sequences.
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)


def load_chunks_from_disk(input_file):
    """Load chunks from a JSON file."""
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) cannot
    # decode many of the characters this file contains.
    with open(input_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_embeddings(embeddings, output_file):
    """Save embeddings to a NumPy .npy file."""
    np.save(output_file, embeddings)


def load_embeddings(input_file):
    """Load embeddings from a NumPy .npy file."""
    return np.load(input_file)


def load_metadata(input_file):
    """Load chunk metadata from a CSV file into a pandas DataFrame."""
    return pd.read_csv(input_file)


def load_prompt_template(path, variables: dict):
    """Read a prompt template file and substitute ``{{key}}`` placeholders.

    Args:
        path: Path to the UTF-8 template file.
        variables: Mapping of placeholder names to replacement values;
            each ``{{key}}`` occurrence is replaced by ``str(value)``.

    Returns:
        The template text with all placeholders substituted.
    """
    with open(path, "r", encoding="utf-8") as f:
        template = f.read()
    for key, value in variables.items():
        template = template.replace(f"{{{{{key}}}}}", str(value))
    return template


def load_youtube_data(csv_path: str):
    """Load rows from a CSV file as a list of dicts.

    Raises the csv field size limit first, to accommodate very long
    transcript fields that exceed the default 128 KiB cap.
    """
    csv.field_size_limit(10_000_000)
    with open(csv_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        return list(reader)


def clean_agent_output(output: str) -> str:
    """Extract the text following 'Final Answer:' from an agent transcript.

    Falls back to the whole (stripped) output when no marker is found.
    """
    # Extract only Final Answer
    match = re.search(r"Final Answer:\s*(.*)", output, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else output.strip()