Spaces:
Sleeping
Sleeping
| import re | |
| import unicodedata | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| import csv | |
| import re | |
| import unicodedata | |
# Arabic diacritics (tashkeel) in the \u0617-\u061A and \u064B-\u0652 ranges.
ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
# Kashida/elongation character used for justification; carries no meaning.
TATWEEL = '\u0640'


def clean_text(text, language="ar"):
    """Normalize text for indexing.

    For Arabic ("ar"): strips diacritics and tatweel, folds alef/hamza
    variants and alef maqsura to canonical letters, removes punctuation,
    and collapses whitespace. For any other language (French): only
    collapses whitespace. Returns the stripped result.
    """
    text = unicodedata.normalize("NFC", text)
    if language != "ar":
        # French (and anything non-Arabic): whitespace normalization only.
        return re.sub(r'\s+', ' ', text).strip()
    text = ARABic_sub = ARABIC_DIACRITICS.sub('', text)
    text = text.replace(TATWEEL, '')
    # Fold orthographic variants to one canonical form (single-char maps,
    # so one translate pass is equivalent to the chained replacements).
    fold = str.maketrans({'إ': 'ا', 'أ': 'ا', 'آ': 'ا',
                          'ى': 'ي', 'ؤ': 'و', 'ئ': 'ي'})
    text = text.translate(fold)
    text = re.sub(r'[^\w\s]', '', text)   # drop punctuation/symbols
    text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    return text.strip()
def chunk_text(text, max_tokens):
    """Split *text* into whitespace-delimited chunks of at most
    *max_tokens* words each (each word counts as one token).

    Returns a list of space-joined chunk strings; the final chunk may
    be shorter than *max_tokens*.
    """
    buf = []
    chunks = []
    for word in text.split():
        buf.append(word)
        if len(buf) >= max_tokens:
            chunks.append(' '.join(buf))
            buf = []
    if buf:
        # Flush the trailing partial chunk.
        chunks.append(' '.join(buf))
    return chunks
def save_chunks_to_disk(chunks_data, output_file):
    """Serialize *chunks_data* to *output_file* as pretty-printed JSON.

    ensure_ascii=False keeps Arabic letters readable in the file instead
    of \\uXXXX escapes.
    """
    payload = json.dumps(chunks_data, ensure_ascii=False, indent=2)
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.write(payload)
def load_chunks_from_disk(input_file):
    """Read chunk data back from a UTF-8 JSON file.

    The explicit encoding matters: the platform default (cp1252 on
    Windows) cannot decode Arabic text.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw = source.read()
    return json.loads(raw)
def save_embeddings(embeddings, output_file):
    """Persist an embeddings array to disk in NumPy .npy format."""
    np.save(file=output_file, arr=embeddings)
def load_embeddings(input_file):
    """Read an embeddings array from a NumPy .npy file."""
    embeddings = np.load(input_file)
    return embeddings
def load_metadata(input_file):
    """Load chunk metadata from a CSV file into a pandas DataFrame.

    (The previous docstring was copy-pasted from load_embeddings and
    wrongly claimed this reads a NumPy file.)
    """
    return pd.read_csv(input_file)
def load_prompt_template(path, variables: dict):
    """Read a prompt template file and substitute {{key}} placeholders.

    Each key in *variables* replaces every occurrence of the literal
    token ``{{key}}`` with ``str(value)``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        rendered = handle.read()
    for name, value in variables.items():
        placeholder = "{{" + name + "}}"
        rendered = rendered.replace(placeholder, str(value))
    return rendered
def load_youtube_data(csv_path: str):
    """Read a YouTube-data CSV and return its rows as a list of dicts."""
    # Raise the per-field cap: transcript fields can exceed csv's
    # default 128 KiB limit.
    csv.field_size_limit(10_000_000)
    with open(csv_path, newline='', encoding='utf-8') as handle:
        return [dict(row) for row in csv.DictReader(handle)]
def clean_agent_output(output: str) -> str:
    """Extract the text following 'Final Answer:' (case-insensitive).

    If the marker is absent, the whole output is returned. The result
    is whitespace-stripped either way.
    """
    found = re.search(r"Final Answer:\s*(.*)", output,
                      re.IGNORECASE | re.DOTALL)
    if found:
        return found.group(1).strip()
    return output.strip()