Spaces:

Sp2503
/

Muril-Model

Sleeping

App Files Files Community

Sai809701 committed on Oct 5, 2025

Commit

ac59d2f

1 Parent(s): 1ceae90

added all files

Browse files

Files changed (9) hide show

.gitignore +46 -0
Dockerfile +20 -0
main.py +95 -0
muril_combined_multilingual_model/config_sentence_transformers.json +14 -0
muril_combined_multilingual_model/modules.json +14 -0
muril_combined_multilingual_model/tokenizer.json +0 -0
muril_combined_multilingual_model/tokenizer_config.json +59 -0
precompute_embeddings.py +17 -0
requirements.txt +9 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,46 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+env/
+.env/
+.venv/
+# IDEs and Editors
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+# Jupyter Notebook
+.ipynb_checkpoints
+# Model files and data
+*.h5
+*.pkl
+*.model
+data/
+models/
+logs/
+model.safetensors

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Use the official slim Python 3.10 base image
+FROM python:3.10-slim
+# All following relative paths resolve against /app inside the image
+WORKDIR /app
+# Copy requirements.txt on its own and install from it before the rest of
+# the sources are copied in
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the build context (application code and model directory)
+COPY . .
+# Precompute answer embeddings at build time (writes ./answer_embeddings.pt).
+# NOTE(review): this step reads the model directory and
+# ./muril_multilingual_dataset.csv from the build context; .gitignore excludes
+# model.safetensors and the CSV is not among the files of this commit, so
+# confirm both are supplied some other way or this step will presumably fail.
+RUN python precompute_embeddings.py
+# Document the port the FastAPI server listens on
+EXPOSE 7860
+# Start the FastAPI app with uvicorn, bound to all interfaces on port 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import torch
+from fastapi import FastAPI
+from pydantic import BaseModel
+from sentence_transformers import SentenceTransformer, util
+import pandas as pd
+from langdetect import detect, DetectorFactory
+# Fix langdetect's seed so detection results are consistent between calls
+DetectorFactory.seed = 0
+# --- Configuration ---
+# Directory of the saved SentenceTransformer model loaded by load_resources()
+MODEL_PATH = './muril_combined_multilingual_model'
+# Dataset CSV read by load_resources(); it uses the 'question' and 'answer'
+# columns, and a 'lang' column when one is present
+CSV_PATH = './muril_multilingual_dataset.csv'
+# --- Load MuRIL Model and Dataset ---
+def load_resources():
+    try:
+        model = SentenceTransformer(MODEL_PATH)
+        df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])
+        # If language column not available, default to 'unknown'
+        if 'lang' not in df.columns:
+            df['lang'] = 'unknown'
+        answers = df['answer'].tolist()
+        answer_embeddings = model.encode(answers, convert_to_tensor=True)
+        print("✅ Resources loaded successfully!")
+        return model, df, answer_embeddings
+    except Exception as e:
+        print(f"❌ Error loading resources: {e}")
+        return None, None, None
+# Load everything once at import time; all three names are None if loading failed
+model, df, answer_embeddings = load_resources()
+# --- Initialize FastAPI ---
+app = FastAPI(title="MuRIL Multilingual QA API")
+# --- API Data Models ---
+# Request body accepted by POST /get-answer
+class QueryRequest(BaseModel):
+    # The user's question text; the endpoint runs language detection on it
+    question: str
+# Response body returned by POST /get-answer
+class QAResponse(BaseModel):
+    # Language code reported by detect_language(), or "none" when the model is not loaded
+    detected_lang: str
+    # Text of the best-matching answer row from the dataset
+    answer: str
+    # Cosine similarity between the question and the chosen answer, rounded to 4 decimals
+    score: float
+# --- Helper: Language Detection ---
+def detect_language(text: str) -> str:
+    try:
+        lang_code = detect(text)
+        return lang_code
+    except Exception:
+        return "unknown"
+# --- API Endpoints ---
+@app.post("/get-answer", response_model=QAResponse)
+def get_answer_endpoint(request: QueryRequest):
+    """
+    Retrieve the best matching answer for a given question.
+    Automatically detects language using langdetect.
+    """
+    if not model:
+        return {"detected_lang": "none", "answer": "Model not loaded.", "score": 0.0}
+    # 1️⃣ Detect language automatically
+    detected_lang = detect_language(request.question)
+    print(f"🌐 Detected language: {detected_lang}")
+    # 2️⃣ Filter dataset by detected language (if available)
+    subset_df = df[df['lang'].str.lower() == detected_lang.lower()]
+    if subset_df.empty:
+        subset_df = df  # fallback: use all data if no match
+    subset_embeddings = model.encode(subset_df['answer'].tolist(), convert_to_tensor=True)
+    # 3️⃣ Compute similarity
+    question_emb = model.encode(request.question, convert_to_tensor=True)
+    cosine_scores = util.pytorch_cos_sim(question_emb, subset_embeddings)
+    # 4️⃣ Find best match
+    best_idx = torch.argmax(cosine_scores).item()
+    best_score = cosine_scores[0][best_idx].item()
+    answer = subset_df.iloc[best_idx]['answer']
+    return {
+        "detected_lang": detected_lang,
+        "answer": answer,
+        "score": round(best_score, 4)
+    }
+@app.get("/")
+def read_root():
+    return {"status": "MuRIL Multilingual QA API is running."}

muril_combined_multilingual_model/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "model_type": "SentenceTransformer",
+  "__version__": {
+    "sentence_transformers": "5.1.1",
+    "transformers": "4.56.2",
+    "pytorch": "2.8.0+cu126"
+  },
+  "prompts": {
+    "query": "",
+    "document": ""
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}

muril_combined_multilingual_model/modules.json ADDED Viewed

	@@ -0,0 +1,14 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  }
+]

muril_combined_multilingual_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

muril_combined_multilingual_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "104": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "105": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "extra_special_tokens": {},
+  "lowercase": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": false,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

precompute_embeddings.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import torch
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+MODEL_PATH = './muril_combined_multilingual_model'
+CSV_PATH = './muril_multilingual_dataset.csv'
+EMB_PATH = './answer_embeddings.pt'
+print("🔄 Precomputing embeddings...")
+model = SentenceTransformer(MODEL_PATH)
+df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])
+answers = df['answer'].tolist()
+answer_embeddings = model.encode(answers, convert_to_tensor=True)
+torch.save(answer_embeddings, EMB_PATH)
+print(f"✅ Saved {len(answers)} embeddings to {EMB_PATH}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# torch "+cpu" builds are served from the PyTorch CPU wheel index, not PyPI
+--extra-index-url https://download.pytorch.org/whl/cpu
+fastapi
+uvicorn
+torch==2.1.0+cpu
+sentence-transformers
+pandas
+langdetect
+requests
+tqdm
+PyMuPDF