Spaces:
Sleeping
Sleeping
| import re | |
| import unicodedata | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| import csv | |
| import re | |
| import unicodedata | |
# Arabic diacritics (tashkeel) in the \u0617-\u061A and \u064B-\u0652 ranges.
ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
# Kashida/elongation character used for justification; carries no meaning.
TATWEEL = '\u0640'


def clean_text(text, language="ar"):
    """Normalize text for indexing.

    For Arabic ("ar"): strips diacritics and tatweel, folds alef/hamza
    variants and alef maqsura to canonical letters, removes punctuation,
    and collapses whitespace. For any other language (French): only
    collapses whitespace. Returns the stripped result.
    """
    text = unicodedata.normalize("NFC", text)
    if language != "ar":
        # French (and anything non-Arabic): whitespace normalization only.
        return re.sub(r'\s+', ' ', text).strip()
    text = ARABic_sub = ARABIC_DIACRITICS.sub('', text)
    text = text.replace(TATWEEL, '')
    # Fold orthographic variants to one canonical form (single-char maps,
    # so one translate pass is equivalent to the chained replacements).
    fold = str.maketrans({'إ': 'ا', 'أ': 'ا', 'آ': 'ا',
                          'ى': 'ي', 'ؤ': 'و', 'ئ': 'ي'})
    text = text.translate(fold)
    text = re.sub(r'[^\w\s]', '', text)   # drop punctuation/symbols
    text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    return text.strip()
def chunk_text(text, max_tokens):
    """Split *text* into whitespace-delimited chunks of at most
    *max_tokens* words each (each word counts as one token).

    Returns a list of space-joined chunk strings; the final chunk may
    be shorter than *max_tokens*.
    """
    buf = []
    chunks = []
    for word in text.split():
        buf.append(word)
        if len(buf) >= max_tokens:
            chunks.append(' '.join(buf))
            buf = []
    if buf:
        # Flush the trailing partial chunk.
        chunks.append(' '.join(buf))
    return chunks
def save_chunks_to_disk(chunks_data, output_file):
    """Serialize *chunks_data* to *output_file* as pretty-printed JSON.

    ensure_ascii=False keeps Arabic letters readable in the file instead
    of \\uXXXX escapes.
    """
    payload = json.dumps(chunks_data, ensure_ascii=False, indent=2)
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.write(payload)
def load_chunks_from_disk(input_file):
    """Read chunk data back from a UTF-8 JSON file.

    The explicit encoding matters: the platform default (cp1252 on
    Windows) cannot decode Arabic text.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw = source.read()
    return json.loads(raw)
def save_embeddings(embeddings, output_file):
    """Persist an embeddings array to disk in NumPy .npy format."""
    np.save(file=output_file, arr=embeddings)
def load_embeddings(input_file):
    """Read an embeddings array from a NumPy .npy file."""
    embeddings = np.load(input_file)
    return embeddings
def load_metadata(input_file):
    """Load chunk metadata from a CSV file into a pandas DataFrame.

    (The previous docstring was copy-pasted from load_embeddings and
    wrongly claimed this reads a NumPy file.)
    """
    return pd.read_csv(input_file)
def load_prompt_template(path, variables: dict):
    """Read a prompt template file and substitute {{key}} placeholders.

    Each key in *variables* replaces every occurrence of the literal
    token ``{{key}}`` with ``str(value)``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        rendered = handle.read()
    for name, value in variables.items():
        placeholder = "{{" + name + "}}"
        rendered = rendered.replace(placeholder, str(value))
    return rendered
def load_youtube_data(csv_path: str):
    """Read a YouTube-data CSV and return its rows as a list of dicts."""
    # Raise the per-field cap: transcript fields can exceed csv's
    # default 128 KiB limit.
    csv.field_size_limit(10_000_000)
    with open(csv_path, newline='', encoding='utf-8') as handle:
        return [dict(row) for row in csv.DictReader(handle)]
def clean_agent_output(output: str) -> str:
    """Extract the text following 'Final Answer:' (case-insensitive).

    If the marker is absent, the whole output is returned. The result
    is whitespace-stripped either way.
    """
    found = re.search(r"Final Answer:\s*(.*)", output,
                      re.IGNORECASE | re.DOTALL)
    if found:
        return found.group(1).strip()
    return output.strip()