import os
import re
import json
import shutil
import hashlib
import unicodedata
from glob import glob
from typing import List, Dict, Any, Iterable

import numpy as np
import pandas as pd
import faiss
import torch

# --- Important: Make sure to install the required libraries ---
# pip install numpy pandas pyarrow torch transformers sentence-transformers faiss-cpu
# --- All necessary classes are included here for a self-contained script ---
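# Pipeline overview: collect (title, response, image_path) records from the JSON
# files under food_dataset_root, keep only passages whose referenced image exists
# on disk, deduplicate them into a Parquet docstore, and build two FAISS
# inner-product indexes over the passages: one with the fine-tuned Glot500
# sentence encoder and one with the fine-tuned multilingual-CLIP text encoder.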


class Config:
    docstore_path: str = "indexes/docstore.parquet"
    glot_model_hf: str = "Arshiaizd/Glot500-FineTuned"
    mclip_text_model_hf: str = "Arshiaizd/MCLIP_FA_FineTuned"
    glot_index_out: str = "indexes/I_glot_text_fa.index"
    clip_index_out: str = "indexes/I_clip_text_fa.index"
    food_dataset_root: str = "./data/food_passages"
    max_text_len: int = 512


class Glot500Encoder:
    def __init__(self, model_id: str):
        from sentence_transformers import SentenceTransformer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.st_model = SentenceTransformer(model_id, device=str(self.device))

    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        # Embeddings are L2-normalized, so inner-product search equals cosine similarity.
        return self.st_model.encode(texts, batch_size=batch_size, show_progress_bar=True,
                                    convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)


class FaTextEncoder:
    def __init__(self, model_id: str, device: torch.device, max_len: int):
        from transformers import AutoTokenizer, AutoModel
        self.device, self.max_len = device, max_len
        self.tok = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModel.from_pretrained(model_id).to(device).eval()

    @torch.no_grad()
    def encode_numpy(self, texts: List[str], batch_size: int = 128) -> np.ndarray:
        vecs = []
        for i in range(0, len(texts), batch_size):
            toks = self.tok(texts[i:i + batch_size], padding=True, truncation=True,
                            max_length=self.max_len, return_tensors="pt").to(self.device)
            out = self.model(**toks)
            if getattr(out, "pooler_output", None) is not None:
                x = out.pooler_output
            else:
                # Mean-pool the token embeddings, ignoring padding positions.
                mask = toks.attention_mask.unsqueeze(-1)
                x = (out.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1)
            x = x / x.norm(p=2, dim=1, keepdim=True)
            vecs.append(x.cpu().numpy())
        return np.vstack(vecs).astype(np.float32)


class Utils:
    @staticmethod
    def _normalize_title(s: str) -> str:
        if s is None:
            return ""
        s = str(s).strip().replace("ي", "ی").replace("ك", "ک")
        s = re.sub(r"\s+", " ", s)
        s = re.sub(r"[^\w\u0600-\u06FF\s-]", "", s)
        return s.lower()

    @staticmethod
    def _iter_json_records(json_path: str) -> Iterable[Dict[str, Any]]:
        # Handles both a single (possibly multi-line) JSON value
        # and line-delimited JSON (JSONL).
        with open(json_path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
        if not txt:
            return
        try:
            # Try to parse the whole file as a single JSON value (dict or list).
            obj = json.loads(txt)
            if isinstance(obj, dict):
                yield obj
                return
            for it in obj if isinstance(obj, list) else []:
                if isinstance(it, dict):
                    yield it
            return
        except json.JSONDecodeError:
            # If that fails, fall back to parsing line by line.
            for line in txt.splitlines():
                if not (line := line.strip()):
                    continue
                try:
                    if isinstance((obj := json.loads(line)), dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

    @staticmethod
    def _collect_pairs(root: str) -> pd.DataFrame:
        rows = []
        json_files = glob(os.path.join(root, "**/*.json"), recursive=True)
        if not json_files:
            print(f"Warning: No JSON files found in {root}. Please check the path.")
            return pd.DataFrame(rows)
        for jp in json_files:
            base_dir = os.path.dirname(jp)
            for rec in Utils._iter_json_records(jp):
                title, resp, img_rel = rec.get("title"), rec.get("response"), rec.get("image_path")
                if not all([title, resp, img_rel]):
                    continue
                # Keep only records whose referenced image actually exists on disk.
                img_abs = os.path.normpath(os.path.join(base_dir, img_rel))
                if not os.path.isfile(img_abs):
                    continue
                rows.append({"title": str(title), "text": str(resp)})
        return pd.DataFrame(rows)

    @staticmethod
    def _build_docstore(df: pd.DataFrame) -> pd.DataFrame:
        def _mk_id(row_text):
            return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]
        # An empty DataFrame has no 'text' column; return an empty docstore in that case.
        if "text" not in df.columns:
            return pd.DataFrame(columns=["id", "passage_text", "title"])
        df["id"] = df["text"].apply(_mk_id)
        return df.rename(columns={"text": "passage_text"})

    @staticmethod
    def prep_dataset(root: str, out_docstore: str):
        print("Building docstore from source JSONs...")
        os.makedirs(os.path.dirname(out_docstore), exist_ok=True)
        df = Utils._collect_pairs(root)
        print(f"Found {len(df)} total passages.")
        if df.empty:
            print("Warning: No valid data found to process. The docstore will be empty.")
            doc = Utils._build_docstore(df)
        else:
            df.drop_duplicates(subset=["text"], keep="first", inplace=True)
            print(f"Found {len(df)} unique passages after deduplication.")
            doc = Utils._build_docstore(df)
        doc.to_parquet(out_docstore, index=False)
        print(f"Docstore saved to {out_docstore}.")
        return doc


def build_faiss_index(encoder, docstore, index_path, text_col="passage_text"):
    print(f"Building FAISS index: {os.path.basename(index_path)}")
    if docstore.empty:
        print("Docstore is empty. Skipping FAISS index creation.")
        return
    texts = docstore[text_col].astype(str).tolist()
    # FaTextEncoder exposes encode_numpy(); Glot500Encoder exposes encode().
    vecs = encoder.encode_numpy(texts) if hasattr(encoder, "encode_numpy") else encoder.encode(texts)
    # Vectors are L2-normalized, so an inner-product (IP) index gives cosine similarity.
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs.astype("float32"))
    faiss.write_index(index, index_path)
    print("Index built and saved successfully.")


def main():
    cfg = Config()
    # Clean up old indexes first
    if os.path.isdir("indexes"):
        print("Removing old 'indexes' directory...")
        shutil.rmtree("indexes")
    # 1. Create the deduplicated docstore
    docstore = Utils.prep_dataset(root=cfg.food_dataset_root, out_docstore=cfg.docstore_path)
    # 2. Build Glot index
    print("\n--- Building Glot Index ---")
    glot_encoder = Glot500Encoder(cfg.glot_model_hf)
    build_faiss_index(glot_encoder, docstore, cfg.glot_index_out)
    # 3. Build CLIP text index
    print("\n--- Building CLIP Text Index ---")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_text_encoder = FaTextEncoder(cfg.mclip_text_model_hf, device, cfg.max_text_len)
    build_faiss_index(clip_text_encoder, docstore, cfg.clip_index_out)
    print("\nAll new indexes have been created successfully!")


if __name__ == "__main__":
    main()