Sai809701 committed on
Commit
ac59d2f
·
1 Parent(s): 1ceae90

added all files

Browse files
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ .env/
27
+ .venv/
28
+
29
+ # IDEs and Editors
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Jupyter Notebook
37
+ .ipynb_checkpoints
38
+
39
+ # Model files and data
40
+ *.h5
41
+ *.pkl
42
+ *.model
43
+ data/
44
+ models/
45
+ logs/
46
+ model.safetensors
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python image
2
+ FROM python:3.10-slim
3
+
4
+ WORKDIR /app
5
+
6
+ # Copy and install dependencies
7
+ COPY requirements.txt .
8
+ RUN pip install --no-cache-dir -r requirements.txt
9
+
10
+ # Copy all files
11
+ COPY . .
12
+
13
+ # Precompute embeddings at build time
14
+ RUN python precompute_embeddings.py
15
+
16
+ # Expose FastAPI port
17
+ EXPOSE 7860
18
+
19
+ # Run FastAPI
20
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from fastapi import FastAPI
4
+ from pydantic import BaseModel
5
+ from sentence_transformers import SentenceTransformer, util
6
+ import pandas as pd
7
+ from langdetect import detect, DetectorFactory
8
+
9
+ # To ensure consistent language detection
10
+ DetectorFactory.seed = 0
11
+
12
+ # --- Configuration ---
13
+ MODEL_PATH = './muril_combined_multilingual_model'
14
+ CSV_PATH = './muril_multilingual_dataset.csv'
15
+
16
+ # --- Load MuRIL Model and Dataset ---
17
def load_resources():
    """Load the MuRIL sentence-transformer, the QA dataset, and answer embeddings.

    Returns:
        (model, df, answer_embeddings) on success, or (None, None, None) if any
        resource fails to load so the API can degrade gracefully instead of
        crashing at import time.
    """
    try:
        model = SentenceTransformer(MODEL_PATH)
        df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])

        # If language column not available, default to 'unknown'
        if 'lang' not in df.columns:
            df['lang'] = 'unknown'

        # Reuse the tensor written at image build time by
        # precompute_embeddings.py (see Dockerfile) instead of re-encoding the
        # whole dataset on every startup. Both scripts apply the same
        # dropna(['question', 'answer']) filter, so row order stays aligned;
        # the length check guards against a stale file.
        answer_embeddings = None
        emb_path = './answer_embeddings.pt'
        if os.path.exists(emb_path):
            answer_embeddings = torch.load(emb_path)
            if len(answer_embeddings) != len(df):
                answer_embeddings = None  # stale precompute; re-encode below
        if answer_embeddings is None:
            answer_embeddings = model.encode(df['answer'].tolist(), convert_to_tensor=True)

        print("✅ Resources loaded successfully!")
        return model, df, answer_embeddings
    except Exception as e:
        # Best-effort startup: report and let endpoints answer "Model not loaded."
        print(f"❌ Error loading resources: {e}")
        return None, None, None
34
+
35
# Loaded once at import time; the endpoints below read these module-level
# globals. Any of the three may be None if load_resources() failed.
model, df, answer_embeddings = load_resources()

# --- Initialize FastAPI ---
app = FastAPI(title="MuRIL Multilingual QA API")
39
+
40
+ # --- API Data Models ---
41
class QueryRequest(BaseModel):
    # Request payload for /get-answer: the user's question text.
    question: str
43
+
44
class QAResponse(BaseModel):
    # Response payload returned by /get-answer.
    detected_lang: str  # langdetect code, or "unknown"/"none" on failure
    answer: str         # best-matching answer text from the dataset
    score: float        # cosine similarity, rounded to 4 decimal places
48
+
49
+ # --- Helper: Language Detection ---
50
def detect_language(text: str) -> str:
    """Best-effort language code for *text*; "unknown" when detection fails."""
    try:
        return detect(text)
    except Exception:
        # langdetect raises on empty / undecidable input; treat as unknown.
        return "unknown"
56
+
57
+ # --- API Endpoints ---
58
@app.post("/get-answer", response_model=QAResponse)
def get_answer_endpoint(request: QueryRequest):
    """
    Retrieve the best matching answer for a given question.
    Automatically detects language using langdetect.
    """
    # NOTE: the original `if not model:` tested nn.Sequential truthiness
    # (module count), not load success. Check all three globals explicitly.
    if model is None or df is None or answer_embeddings is None:
        return {"detected_lang": "none", "answer": "Model not loaded.", "score": 0.0}

    # 1️⃣ Detect language automatically
    detected_lang = detect_language(request.question)
    print(f"🌐 Detected language: {detected_lang}")

    # 2️⃣ Filter dataset by detected language (if available). Slice the
    # precomputed embeddings by the same row mask instead of re-encoding
    # every answer on each request (row order of answer_embeddings matches df).
    mask = df['lang'].str.lower() == detected_lang.lower()
    if mask.any():
        subset_df = df[mask]
        row_positions = mask.to_numpy().nonzero()[0].tolist()
        subset_embeddings = answer_embeddings[row_positions]
    else:
        subset_df = df  # fallback: use all data if no match
        subset_embeddings = answer_embeddings

    # 3️⃣ Compute similarity
    question_emb = model.encode(request.question, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(question_emb, subset_embeddings)

    # 4️⃣ Find best match (cosine_scores is 1 x len(subset))
    best_idx = torch.argmax(cosine_scores).item()
    best_score = cosine_scores[0][best_idx].item()
    answer = subset_df.iloc[best_idx]['answer']

    return {
        "detected_lang": detected_lang,
        "answer": answer,
        "score": round(best_score, 4)
    }
92
+
93
@app.get("/")
def read_root():
    """Liveness probe: confirms the service is up."""
    status_message = "MuRIL Multilingual QA API is running."
    return {"status": status_message}
muril_combined_multilingual_model/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "SentenceTransformer",
3
+ "__version__": {
4
+ "sentence_transformers": "5.1.1",
5
+ "transformers": "4.56.2",
6
+ "pytorch": "2.8.0+cu126"
7
+ },
8
+ "prompts": {
9
+ "query": "",
10
+ "document": ""
11
+ },
12
+ "default_prompt_name": null,
13
+ "similarity_fn_name": "cosine"
14
+ }
muril_combined_multilingual_model/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
muril_combined_multilingual_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
muril_combined_multilingual_model/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "extra_special_tokens": {},
49
+ "lowercase": false,
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "never_split": null,
53
+ "pad_token": "[PAD]",
54
+ "sep_token": "[SEP]",
55
+ "strip_accents": false,
56
+ "tokenize_chinese_chars": true,
57
+ "tokenizer_class": "BertTokenizer",
58
+ "unk_token": "[UNK]"
59
+ }
precompute_embeddings.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer

MODEL_PATH = './muril_combined_multilingual_model'
CSV_PATH = './muril_multilingual_dataset.csv'
EMB_PATH = './answer_embeddings.pt'


def main() -> None:
    """Encode every answer in the dataset and persist the tensor to EMB_PATH.

    Intended to run at image build time (see Dockerfile) so the API can load
    the embeddings instead of re-encoding at startup. Uses the same
    dropna(['question', 'answer']) filter as main.py so row order stays
    aligned with the DataFrame the API loads.
    """
    print("🔄 Precomputing embeddings...")
    model = SentenceTransformer(MODEL_PATH)
    df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])

    answers = df['answer'].tolist()
    answer_embeddings = model.encode(answers, convert_to_tensor=True)
    torch.save(answer_embeddings, EMB_PATH)

    print(f"✅ Saved {len(answers)} embeddings to {EMB_PATH}")


# Guard so importing this module (e.g. from a test) doesn't trigger the
# expensive model load and file write; `python precompute_embeddings.py`
# behaves exactly as before.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch==2.1.0+cpu
4
+ sentence-transformers
5
+ pandas
6
+ langdetect
7
+ requests
8
+ tqdm
9
+ PyMuPDF