import os
import re
import json
import shutil
import hashlib
import unicodedata
from glob import glob
from typing import List, Dict, Any, Iterable

import numpy as np
import pandas as pd
import faiss
import torch

# --- Important: Make sure to install the required libraries ---
# pip install numpy pandas pyarrow torch transformers sentence-transformers faiss-cpu
# --- All necessary classes are included here for a self-contained script ---
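# Pipeline overview: collect (title, response, image_path) records from the JSON
# files under food_dataset_root, keep only passages whose referenced image exists
# on disk, deduplicate them into a Parquet docstore, and build two FAISS
# inner-product indexes over the passages: one with the fine-tuned Glot500
# sentence encoder and one with the fine-tuned multilingual-CLIP text encoder.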


class Config:
    docstore_path: str = "indexes/docstore.parquet"
    glot_model_hf: str = "Arshiaizd/Glot500-FineTuned"
    mclip_text_model_hf: str = "Arshiaizd/MCLIP_FA_FineTuned"
    glot_index_out: str = "indexes/I_glot_text_fa.index"
    clip_index_out: str = "indexes/I_clip_text_fa.index"
    food_dataset_root: str = "./data/food_passages"
    max_text_len: int = 512


class Glot500Encoder:
    def __init__(self, model_id: str):
        from sentence_transformers import SentenceTransformer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.st_model = SentenceTransformer(model_id, device=str(self.device))

    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        # Embeddings are L2-normalized, so inner-product search equals cosine similarity.
        return self.st_model.encode(texts, batch_size=batch_size, show_progress_bar=True,
                                    convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)


class FaTextEncoder:
    def __init__(self, model_id: str, device: torch.device, max_len: int):
        from transformers import AutoTokenizer, AutoModel
        self.device, self.max_len = device, max_len
        self.tok = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModel.from_pretrained(model_id).to(device).eval()

    @torch.no_grad()
    def encode_numpy(self, texts: List[str], batch_size: int = 128) -> np.ndarray:
        vecs = []
        for i in range(0, len(texts), batch_size):
            toks = self.tok(texts[i:i + batch_size], padding=True, truncation=True,
                            max_length=self.max_len, return_tensors="pt").to(self.device)
            out = self.model(**toks)
            if getattr(out, "pooler_output", None) is not None:
                x = out.pooler_output
            else:
                # Mean-pool the token embeddings, ignoring padding positions.
                mask = toks.attention_mask.unsqueeze(-1)
                x = (out.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1)
            x = x / x.norm(p=2, dim=1, keepdim=True)
            vecs.append(x.cpu().numpy())
        return np.vstack(vecs).astype(np.float32)


class Utils:
    @staticmethod
    def _normalize_title(s: str) -> str:
        if s is None:
            return ""
        s = str(s).strip().replace("ي", "ی").replace("ك", "ک")
        s = re.sub(r"\s+", " ", s)
        s = re.sub(r"[^\w\u0600-\u06FF\s-]", "", s)
        return s.lower()

    @staticmethod
    def _iter_json_records(json_path: str) -> Iterable[Dict[str, Any]]:
        # Handles both a single (possibly multi-line) JSON value
        # and line-delimited JSON (JSONL).
        with open(json_path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
        if not txt:
            return
        try:
            # Try to parse the whole file as a single JSON value (dict or list).
            obj = json.loads(txt)
            if isinstance(obj, dict):
                yield obj
                return
            for it in obj if isinstance(obj, list) else []:
                if isinstance(it, dict):
                    yield it
            return
        except json.JSONDecodeError:
            # If that fails, fall back to parsing line by line.
            for line in txt.splitlines():
                if not (line := line.strip()):
                    continue
                try:
                    if isinstance((obj := json.loads(line)), dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

    @staticmethod
    def _collect_pairs(root: str) -> pd.DataFrame:
        rows = []
        json_files = glob(os.path.join(root, "**/*.json"), recursive=True)
        if not json_files:
            print(f"Warning: No JSON files found in {root}. Please check the path.")
            return pd.DataFrame(rows)
        for jp in json_files:
            base_dir = os.path.dirname(jp)
            for rec in Utils._iter_json_records(jp):
                title, resp, img_rel = rec.get("title"), rec.get("response"), rec.get("image_path")
                if not all([title, resp, img_rel]):
                    continue
                # Keep only records whose referenced image actually exists on disk.
                img_abs = os.path.normpath(os.path.join(base_dir, img_rel))
                if not os.path.isfile(img_abs):
                    continue
                rows.append({"title": str(title), "text": str(resp)})
        return pd.DataFrame(rows)

    @staticmethod
    def _build_docstore(df: pd.DataFrame) -> pd.DataFrame:
        def _mk_id(row_text):
            return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]
        # An empty DataFrame has no 'text' column; return an empty docstore in that case.
        if "text" not in df.columns:
            return pd.DataFrame(columns=["id", "passage_text", "title"])
        df["id"] = df["text"].apply(_mk_id)
        return df.rename(columns={"text": "passage_text"})

    @staticmethod
    def prep_dataset(root: str, out_docstore: str):
        print("Building docstore from source JSONs...")
        os.makedirs(os.path.dirname(out_docstore), exist_ok=True)
        df = Utils._collect_pairs(root)
        print(f"Found {len(df)} total passages.")
        if df.empty:
            print("Warning: No valid data found to process. The docstore will be empty.")
            doc = Utils._build_docstore(df)
        else:
            df.drop_duplicates(subset=["text"], keep="first", inplace=True)
            print(f"Found {len(df)} unique passages after deduplication.")
            doc = Utils._build_docstore(df)
        doc.to_parquet(out_docstore, index=False)
        print(f"Docstore saved to {out_docstore}.")
        return doc


def build_faiss_index(encoder, docstore, index_path, text_col="passage_text"):
    print(f"Building FAISS index: {os.path.basename(index_path)}")
    if docstore.empty:
        print("Docstore is empty. Skipping FAISS index creation.")
        return
    texts = docstore[text_col].astype(str).tolist()
    # FaTextEncoder exposes encode_numpy(); Glot500Encoder exposes encode().
    vecs = encoder.encode_numpy(texts) if hasattr(encoder, "encode_numpy") else encoder.encode(texts)
    # Vectors are L2-normalized, so an inner-product (IP) index gives cosine similarity.
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs.astype("float32"))
    faiss.write_index(index, index_path)
    print("Index built and saved successfully.")


def main():
    cfg = Config()
    # Clean up old indexes first
    if os.path.isdir("indexes"):
        print("Removing old 'indexes' directory...")
        shutil.rmtree("indexes")
    # 1. Create the deduplicated docstore
    docstore = Utils.prep_dataset(root=cfg.food_dataset_root, out_docstore=cfg.docstore_path)
    # 2. Build Glot index
    print("\n--- Building Glot Index ---")
    glot_encoder = Glot500Encoder(cfg.glot_model_hf)
    build_faiss_index(glot_encoder, docstore, cfg.glot_index_out)
    # 3. Build CLIP text index
    print("\n--- Building CLIP Text Index ---")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_text_encoder = FaTextEncoder(cfg.mclip_text_model_hf, device, cfg.max_text_len)
    build_faiss_index(clip_text_encoder, docstore, cfg.clip_index_out)
    print("\nAll new indexes have been created successfully!")


if __name__ == "__main__":
    main()