File size: 2,806 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re
import unicodedata
import json
import numpy as np
import pandas as pd
import csv
import re
import unicodedata

# Arabic diacritical marks (tashkil): honorific signs U+0617-U+061A plus the
# short-vowel/tanwin/shadda/sukun range U+064B-U+0652.
ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')  # tashkil (diacritics)
TATWEEL = '\u0640'  # Arabic elongation character (kashida), purely cosmetic

def clean_text(text, language="ar"):
    """Normalize a text snippet for indexing/search.

    Arabic ("ar") input is aggressively normalized: diacritics (tashkil)
    and the tatweel elongation character are stripped, common letter
    variants are unified (alef forms -> ا, ى -> ي, ؤ -> و, ئ -> ي),
    punctuation is removed, and whitespace runs are collapsed.  Any
    other language (e.g. French) only gets whitespace collapsing.

    Returns the cleaned, stripped string.
    """
    text = unicodedata.normalize("NFC", text)
    if language != "ar":
        # Non-Arabic (French): just squeeze whitespace runs.
        return re.sub(r'\s+', ' ', text).strip()

    text = ARABIC_DIACRITICS.sub('', text)
    # Single-pass character normalization: drop tatweel, unify variants.
    variant_map = str.maketrans(
        {TATWEEL: None, 'إ': 'ا', 'أ': 'ا', 'آ': 'ا',
         'ى': 'ي', 'ؤ': 'و', 'ئ': 'ي'})
    text = text.translate(variant_map)
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation/symbols
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def chunk_text(text, max_tokens):
    """Split *text* into chunks of at most *max_tokens* words.

    Tokens are whitespace-separated words; each chunk is the words
    re-joined with single spaces.  A trailing partial chunk is kept.
    Returns a (possibly empty) list of chunk strings.
    """
    chunks = []
    buffer = []
    for token in text.split():
        buffer.append(token)
        # Flush the buffer as soon as it reaches the size limit.
        if len(buffer) >= max_tokens:
            chunks.append(' '.join(buffer))
            buffer = []
    if buffer:
        # Leftover words that never filled a whole chunk.
        chunks.append(' '.join(buffer))
    return chunks


def save_chunks_to_disk(chunks_data, output_file):
    """Serialize *chunks_data* to *output_file* as pretty-printed JSON.

    ensure_ascii=False keeps Arabic text readable in the file instead of
    \\uXXXX escapes; utf-8 is set explicitly so the output is portable.
    """
    payload = json.dumps(chunks_data, ensure_ascii=False, indent=2)
    with open(output_file, "w", encoding="utf-8") as fh:
        fh.write(payload)

def load_chunks_from_disk(input_file):
    """Read back a JSON chunks file written by save_chunks_to_disk.

    The explicit utf-8 encoding matters: the platform default (e.g.
    cp1252 on Windows) cannot decode many Arabic characters.
    """
    with open(input_file, encoding="utf-8") as fh:
        return json.loads(fh.read())

def save_embeddings(embeddings, output_file):
    """Persist an embeddings array to *output_file* in NumPy .npy format.

    np.save appends ".npy" to the path if it has no extension.
    """
    np.save(output_file, embeddings)

def load_embeddings(input_file):
    """Load an embeddings array from a NumPy .npy file and return it."""
    return np.load(input_file)

def load_metadata(input_file):
    """Load chunk metadata from a CSV file into a pandas DataFrame.

    (The previous docstring wrongly claimed this reads a NumPy file —
    it parses CSV via pandas.)
    """
    return pd.read_csv(input_file)

def load_prompt_template(path, variables: dict):
    """Read a prompt template file and substitute its placeholders.

    Every occurrence of "{{key}}" in the template is replaced with
    str(value) for each (key, value) pair in *variables*.  Unknown
    placeholders are left untouched.  Returns the rendered string.
    """
    with open(path, "r", encoding="utf-8") as fh:
        rendered = fh.read()
    for name, value in variables.items():
        placeholder = "{{" + name + "}}"
        rendered = rendered.replace(placeholder, str(value))
    return rendered


def load_youtube_data(csv_path: str):
    """Load a YouTube-transcript CSV and return its rows as a list of dicts.

    The csv field-size limit is raised to 10 MB because transcript
    columns can exceed the module's default cap.
    """
    csv.field_size_limit(10_000_000)
    with open(csv_path, newline='', encoding='utf-8') as fh:
        return [row for row in csv.DictReader(fh)]
    

def clean_agent_output(output: str) -> str:
    """Return only the text following "Final Answer:" in an agent trace.

    The search is case-insensitive and DOTALL, so multi-line answers are
    captured in full.  If the marker is absent, the whole output is
    returned (stripped) as a fallback.
    """
    marker = re.search(r"Final Answer:\s*(.*)", output, re.DOTALL | re.IGNORECASE)
    if marker is None:
        return output.strip()
    return marker.group(1).strip()