# RAG_APP/src/utils/helpers.py — shared text-cleaning, chunking, and file I/O helpers
# (header reconstructed from upload metadata: sxid003, "Upload 83 files", rev 3107242)
import re
import unicodedata
import json
import numpy as np
import pandas as pd
import csv
import re
import unicodedata
ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')  # harakat / tashkeel marks
TATWEEL = '\u0640'  # Arabic elongation (kashida) character


def clean_text(text, language="ar"):
    """Normalize raw text before chunking/embedding.

    For ``language == "ar"``: strips diacritics and tatweel, unifies alef
    variants to bare alef, maps final ya and hamza carriers to plain forms,
    removes punctuation, and collapses runs of whitespace. For any other
    language value (treated as French) only whitespace is collapsed.
    Returns the cleaned, stripped string.
    """
    text = unicodedata.normalize("NFC", text)
    if language != "ar":
        # French (or anything non-Arabic): whitespace normalization only
        return re.sub(r'\s+', ' ', text).strip()
    text = ARABIC_DIACRITICS.sub('', text)
    text = text.replace(TATWEEL, '')
    text = re.sub(r'[إأآا]', 'ا', text)        # unify alef-with-hamza/madda to bare alef
    text = text.replace('ى', 'ي')              # alef maqsura -> ya
    text = text.replace('ؤ', 'و').replace('ئ', 'ي')  # hamza carriers -> base letters
    text = re.sub(r'[^\w\s]', '', text)        # drop punctuation/symbols
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def chunk_text(text, max_tokens):
    """Split text into chunks of at most ``max_tokens`` whitespace tokens.

    Tokens are whatever ``str.split()`` yields; each chunk is the tokens
    re-joined with single spaces. Empty input yields an empty list.
    """
    words = text.split()
    # max(1, ...) mirrors the original counter loop, which emitted one-word
    # chunks whenever max_tokens < 1 (the count hit the threshold immediately).
    step = max(1, max_tokens)
    return [' '.join(words[i:i + step]) for i in range(0, len(words), step)]
def save_chunks_to_disk(chunks_data, output_file):
    """Serialize ``chunks_data`` to ``output_file`` as pretty-printed UTF-8 JSON."""
    # ensure_ascii=False keeps Arabic text readable instead of \uXXXX escapes
    payload = json.dumps(chunks_data, ensure_ascii=False, indent=2)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(payload)
def load_chunks_from_disk(input_file):
    """Read chunks previously written by ``save_chunks_to_disk`` (UTF-8 JSON)."""
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) cannot
    # decode much of the Arabic content these files contain.
    with open(input_file, encoding="utf-8") as f:
        return json.loads(f.read())
def save_embeddings(embeddings, output_file):
    """Save an embeddings array to ``output_file`` in NumPy ``.npy`` format.

    Note: when ``output_file`` is a path string without a ``.npy`` suffix,
    ``np.save`` appends one — keep load paths consistent with that.
    """
    np.save(output_file, embeddings)
def load_embeddings(input_file):
    """Load an embeddings array from a NumPy ``.npy`` file and return it."""
    return np.load(input_file)
def load_metadata(input_file):
    """Load chunk metadata from a CSV file into a pandas DataFrame."""
    # NOTE(review): the original docstring said "Load embeddings from NumPy
    # file" — a copy-paste error; this reads CSV metadata via pandas.
    return pd.read_csv(input_file)
def load_prompt_template(path, variables: dict):
    """Read a UTF-8 prompt template and fill in ``{{key}}`` placeholders.

    Every occurrence of ``{{key}}`` in the file's text is replaced with
    ``str(value)`` for each ``key: value`` pair in ``variables``. Returns
    the rendered template; unknown placeholders are left untouched.
    """
    with open(path, "r", encoding="utf-8") as f:
        rendered = f.read()
    for name in variables:
        placeholder = "{{" + name + "}}"
        rendered = rendered.replace(placeholder, str(variables[name]))
    return rendered
def load_youtube_data(csv_path: str):
    """Read a YouTube-transcript CSV into a list of per-row dicts (header-keyed)."""
    # Transcript cells can be very long; raise the csv module's field cap first.
    csv.field_size_limit(10_000_000)
    with open(csv_path, newline='', encoding='utf-8') as handle:
        return [row for row in csv.DictReader(handle)]
def clean_agent_output(output: str) -> str:
    """Return the text after a "Final Answer:" marker, stripped.

    The match is case-insensitive and spans newlines (DOTALL); when no
    marker is present the whole output is returned stripped instead.
    """
    found = re.search(r"Final Answer:\s*(.*)", output, re.DOTALL | re.IGNORECASE)
    if found is None:
        return output.strip()
    return found.group(1).strip()