Sp2503 committed on
Commit
3f83269
·
verified ·
1 Parent(s): 8283750

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +14 -0
  2. Muril-Model/.gitattributes +36 -0
  3. Muril-Model/.gitignore +45 -0
  4. Muril-Model/Dockerfile +34 -0
  5. Muril-Model/Muril-Model/.gitattributes +36 -0
  6. Muril-Model/Muril-Model/.gitignore +45 -0
  7. Muril-Model/Muril-Model/Dockerfile +28 -0
  8. Muril-Model/Muril-Model/README.md +10 -0
  9. Muril-Model/Muril-Model/answer_embeddings.pt +3 -0
  10. Muril-Model/Muril-Model/main.py +70 -0
  11. Muril-Model/Muril-Model/muril_combined_multilingual_model/1_Pooling/config.json +10 -0
  12. Muril-Model/Muril-Model/muril_combined_multilingual_model/config.json +25 -0
  13. Muril-Model/Muril-Model/muril_combined_multilingual_model/config_sentence_transformers.json +14 -0
  14. Muril-Model/Muril-Model/muril_combined_multilingual_model/modules.json +14 -0
  15. Muril-Model/Muril-Model/muril_combined_multilingual_model/sentence_bert_config.json +4 -0
  16. Muril-Model/Muril-Model/muril_combined_multilingual_model/special_tokens_map.json +7 -0
  17. Muril-Model/Muril-Model/muril_combined_multilingual_model/tokenizer.json +0 -0
  18. Muril-Model/Muril-Model/muril_combined_multilingual_model/tokenizer_config.json +59 -0
  19. Muril-Model/Muril-Model/muril_combined_multilingual_model/vocab.txt +0 -0
  20. Muril-Model/Muril-Model/muril_multilingual_dataset.csv +3 -0
  21. Muril-Model/Muril-Model/precompute_embeddings.py +17 -0
  22. Muril-Model/Muril-Model/requirements.txt +12 -0
  23. Muril-Model/README.md +10 -0
  24. Muril-Model/answer_embeddings.pt +3 -0
  25. Muril-Model/app.py +4 -0
  26. Muril-Model/main.py +107 -0
  27. Muril-Model/muril_combined_multilingual_model/1_Pooling/config.json +10 -0
  28. Muril-Model/muril_combined_multilingual_model/config.json +25 -0
  29. Muril-Model/muril_combined_multilingual_model/config_sentence_transformers.json +14 -0
  30. Muril-Model/muril_combined_multilingual_model/model.safetensors +3 -0
  31. Muril-Model/muril_combined_multilingual_model/modules.json +14 -0
  32. Muril-Model/muril_combined_multilingual_model/sentence_bert_config.json +4 -0
  33. Muril-Model/muril_combined_multilingual_model/special_tokens_map.json +7 -0
  34. Muril-Model/muril_combined_multilingual_model/tokenizer.json +0 -0
  35. Muril-Model/muril_combined_multilingual_model/tokenizer_config.json +59 -0
  36. Muril-Model/muril_combined_multilingual_model/vocab.txt +0 -0
  37. Muril-Model/muril_multilingual_dataset.csv +3 -0
  38. Muril-Model/precompute_embeddings.py +17 -0
  39. Muril-Model/requirements.txt +12 -0
  40. New Text Document.txt +0 -0
  41. __pycache__/main.cpython-312.pyc +0 -0
  42. murilmodeltraining.ipynb +0 -0
  43. pdfs/constitution_1.pdf +3 -0
  44. pdfs/constitution_10.pdf +3 -0
  45. pdfs/constitution_11.pdf +3 -0
  46. pdfs/constitution_12.pdf +3 -0
  47. pdfs/constitution_2.pdf +3 -0
  48. pdfs/constitution_3.pdf +3 -0
  49. pdfs/constitution_4.pdf +3 -0
  50. pdfs/constitution_5.pdf +3 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Muril-Model/Muril-Model/muril_multilingual_dataset.csv filter=lfs diff=lfs merge=lfs -text
37
+ Muril-Model/muril_multilingual_dataset.csv filter=lfs diff=lfs merge=lfs -text
38
+ pdfs/constitution_1.pdf filter=lfs diff=lfs merge=lfs -text
39
+ pdfs/constitution_10.pdf filter=lfs diff=lfs merge=lfs -text
40
+ pdfs/constitution_11.pdf filter=lfs diff=lfs merge=lfs -text
41
+ pdfs/constitution_12.pdf filter=lfs diff=lfs merge=lfs -text
42
+ pdfs/constitution_2.pdf filter=lfs diff=lfs merge=lfs -text
43
+ pdfs/constitution_3.pdf filter=lfs diff=lfs merge=lfs -text
44
+ pdfs/constitution_4.pdf filter=lfs diff=lfs merge=lfs -text
45
+ pdfs/constitution_5.pdf filter=lfs diff=lfs merge=lfs -text
46
+ pdfs/constitution_6.pdf filter=lfs diff=lfs merge=lfs -text
47
+ pdfs/constitution_7.pdf filter=lfs diff=lfs merge=lfs -text
48
+ pdfs/constitution_8.pdf filter=lfs diff=lfs merge=lfs -text
49
+ pdfs/constitution_9.pdf filter=lfs diff=lfs merge=lfs -text
Muril-Model/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ muril_multilingual_dataset.csv filter=lfs diff=lfs merge=lfs -text
Muril-Model/.gitignore ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ .env/
27
+ .venv/
28
+
29
+ # IDEs and Editors
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Jupyter Notebook
37
+ .ipynb_checkpoints
38
+
39
+ # Model files and data
40
+ *.h5
41
+ *.pkl
42
+ *.model
43
+ data/
44
+ models/
45
+ logs/
Muril-Model/Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use lightweight Python base image
2
+ FROM python:3.10-slim
3
+
4
+ # Disable CUDA & set cache locations
5
+ ENV TORCH_DISABLE_CUDA=1
6
+ ENV TRANSFORMERS_CACHE=/app/hf_cache
7
+ ENV HF_HOME=/app/hf_cache
8
+
9
+ # Working directory
10
+ WORKDIR /app
11
+
12
+ # Copy dependency list
13
+ COPY requirements.txt .
14
+
15
+ # Install dependencies efficiently
16
+ RUN apt-get update && apt-get install -y git && \
17
+ pip install --no-cache-dir -r requirements.txt && \
18
+ rm -rf /var/lib/apt/lists/*
19
+
20
+ # Copy app code
21
+ COPY . .
22
+
23
+ # Create cache folder (writable)
24
+ RUN mkdir -p /app/hf_cache && chmod -R 777 /app/hf_cache
25
+
26
+ # Expose port for FastAPI
27
+ EXPOSE 8080
28
+
29
+ # Health check (optional for Cloud Run / Spaces)
30
+ HEALTHCHECK CMD curl --fail http://localhost:8080/ || exit 1
31
+
32
+ # Run the FastAPI app
33
+ # Using "app:app" ensures fast startup since app.py imports main.py lazily.
34
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]
Muril-Model/Muril-Model/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ muril_multilingual_dataset.csv filter=lfs diff=lfs merge=lfs -text
Muril-Model/Muril-Model/.gitignore ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ .env/
27
+ .venv/
28
+
29
+ # IDEs and Editors
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Jupyter Notebook
37
+ .ipynb_checkpoints
38
+
39
+ # Model files and data
40
+ *.h5
41
+ *.pkl
42
+ *.model
43
+ data/
44
+ models/
45
+ logs/
Muril-Model/Muril-Model/Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Lightweight Python image
2
+ FROM python:3.10-slim
3
+
4
+ # Disable CUDA & set cache
5
+ ENV TORCH_DISABLE_CUDA=1
6
+ ENV HF_HOME=/app/hf_cache
7
+ ENV TRANSFORMERS_CACHE=/app/hf_cache
8
+
9
+ # Working directory
10
+ WORKDIR /app
11
+
12
+ # Install dependencies
13
+ COPY requirements.txt .
14
+ RUN apt-get update && apt-get install -y git && \
15
+ pip install --no-cache-dir -r requirements.txt && \
16
+ rm -rf /var/lib/apt/lists/*
17
+
18
+ # Copy app code and model/data
19
+ COPY . .
20
+
21
+ # Make cache folder writable
22
+ RUN mkdir -p /app/hf_cache && chmod -R 777 /app/hf_cache
23
+
24
+ # Expose port
25
+ EXPOSE 8080
26
+
27
+ # Run FastAPI via uvicorn
28
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
Muril-Model/Muril-Model/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Muril Model
3
+ emoji: 🌖
4
+ colorFrom: gray
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Muril-Model/Muril-Model/answer_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fa2e749567247c8a15144c6a0b1d3423ae8a8a0054aee9f3cc2774f8b9cb555
3
+ size 83854959
Muril-Model/Muril-Model/main.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

# --- Cache configuration ---
# NOTE: these must be set BEFORE importing transformers/sentence_transformers/
# huggingface_hub, which read the variables at import time. The original code
# set them after the imports, so they had no effect (the Dockerfile ENV vars
# were silently doing the real work).
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["TORCH_DISABLE_CUDA"] = "1"

import torch
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import snapshot_download

# --- Hugging Face repo holding the fine-tuned model, dataset and embeddings ---
HF_REPO = "Sp2503/Muril-Model"

# --- Download model & embeddings from Hugging Face Hub (blocks until done) ---
print("📦 Downloading model & embeddings from Hugging Face Hub...")
model_dir = snapshot_download(repo_id=HF_REPO, repo_type="model")
print(f"✅ Model snapshot available at: {model_dir}")

MODEL_PATH = model_dir
CSV_PATH = os.path.join(model_dir, "muril_multilingual_dataset.csv")
EMBED_PATH = os.path.join(model_dir, "answer_embeddings.pt")

# --- Load resources once at import time ---
print("⚙️ Loading model and embeddings...")
model = SentenceTransformer(MODEL_PATH)
df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])
answer_embeddings = torch.load(EMBED_PATH, map_location="cpu")
print("✅ Model and embeddings loaded successfully.")

# --- FastAPI Setup ---
app = FastAPI(title="MuRIL Multilingual QA API")


class QueryRequest(BaseModel):
    """Incoming question; `lang` optionally restricts retrieval to one language."""
    question: str
    # `str | None` replaces the invalid `lang: str = None` annotation.
    lang: str | None = None


class QAResponse(BaseModel):
    """Single best-matching answer from the dataset."""
    answer: str


@app.get("/")
def root():
    """Liveness probe; the model is loaded at import time, so always ready."""
    return {"status": "✅ API ready", "model_loaded": True}


@app.post("/get-answer", response_model=QAResponse)
def get_answer_endpoint(request: QueryRequest):
    """Return the dataset answer whose precomputed embedding is most
    cosine-similar to the encoded question, optionally filtered by language."""
    question_text = request.question.strip()
    lang_filter = request.lang

    filtered_df = df
    filtered_embeddings = answer_embeddings
    if 'lang' in df.columns and lang_filter:
        # mask.values is a positional boolean array matching the row order
        # used when the embeddings were precomputed, so tensor indexing and
        # the reset-index dataframe stay aligned.
        mask = df['lang'] == lang_filter
        filtered_df = df[mask].reset_index(drop=True)
        filtered_embeddings = answer_embeddings[mask.values]

    if len(filtered_df) == 0:
        return {"answer": f"No data found for language '{lang_filter}'."}

    question_emb = model.encode(question_text, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(question_emb, filtered_embeddings)
    best_idx = torch.argmax(cosine_scores).item()
    answer = filtered_df.iloc[best_idx]['answer']
    return {"answer": answer}


if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8080)
Muril-Model/Muril-Model/muril_combined_multilingual_model/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
Muril-Model/Muril-Model/muril_combined_multilingual_model/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.56.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 197285
25
+ }
Muril-Model/Muril-Model/muril_combined_multilingual_model/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "SentenceTransformer",
3
+ "__version__": {
4
+ "sentence_transformers": "5.1.1",
5
+ "transformers": "4.56.2",
6
+ "pytorch": "2.8.0+cu126"
7
+ },
8
+ "prompts": {
9
+ "query": "",
10
+ "document": ""
11
+ },
12
+ "default_prompt_name": null,
13
+ "similarity_fn_name": "cosine"
14
+ }
Muril-Model/Muril-Model/muril_combined_multilingual_model/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
Muril-Model/Muril-Model/muril_combined_multilingual_model/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
Muril-Model/Muril-Model/muril_combined_multilingual_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
Muril-Model/Muril-Model/muril_combined_multilingual_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Muril-Model/Muril-Model/muril_combined_multilingual_model/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "extra_special_tokens": {},
49
+ "lowercase": false,
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "never_split": null,
53
+ "pad_token": "[PAD]",
54
+ "sep_token": "[SEP]",
55
+ "strip_accents": false,
56
+ "tokenize_chinese_chars": true,
57
+ "tokenizer_class": "BertTokenizer",
58
+ "unk_token": "[UNK]"
59
+ }
Muril-Model/Muril-Model/muril_combined_multilingual_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
Muril-Model/Muril-Model/muril_multilingual_dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cca0ed98f57664e558825059722272d15fe99f5238969e95f523629fb50cec
3
+ size 16996056
Muril-Model/Muril-Model/precompute_embeddings.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Precompute sentence embeddings for every answer in the QA dataset.

Encodes the `answer` column with the fine-tuned MuRIL SentenceTransformer
and saves the resulting tensor so the serving API can load it instead of
re-encoding at startup.
"""
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer

MODEL_PATH = './muril_combined_multilingual_model'
CSV_PATH = './muril_multilingual_dataset.csv'
EMB_PATH = './answer_embeddings.pt'

print("🔄 Precomputing embeddings...")
encoder = SentenceTransformer(MODEL_PATH)
# Drop rows missing either side of the QA pair so embeddings stay aligned
# with the dataframe rows the API will filter on.
dataset = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])

answers = dataset['answer'].tolist()
embeddings = encoder.encode(answers, convert_to_tensor=True)
torch.save(embeddings, EMB_PATH)

print(f"✅ Saved {len(answers)} embeddings to {EMB_PATH}")
Muril-Model/Muril-Model/requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.118.0
2
+ uvicorn==0.37.0
3
+ torch==2.1.0
4
+ sentence-transformers==5.1.1
5
+ transformers==4.43.3
6
+ numpy<2
7
+ pandas==2.1.1
8
+ langdetect==1.0.9
9
+ requests==2.31.0
10
+ tqdm==4.65.0
11
+ PyMuPDF==1.23.0
12
+ huggingface_hub==0.23.4
Muril-Model/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Muril Model
3
+ emoji: 🌖
4
+ colorFrom: gray
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Muril-Model/answer_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fa2e749567247c8a15144c6a0b1d3423ae8a8a0054aee9f3cc2774f8b9cb555
3
+ size 83854959
Muril-Model/app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
# app.py
# Thin entry point: expose main.py's FastAPI instance under the name
# uvicorn expects when launched as "app:app".
import importlib

_main_module = importlib.import_module("main")
app = _main_module.app
Muril-Model/main.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

# --- Cache configuration ---
# NOTE: these must be set BEFORE importing transformers/sentence_transformers/
# huggingface_hub, which read the variables at import time. The original code
# set them after the imports, so they had no effect (the Dockerfile ENV vars
# were silently doing the real work).
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["TORCH_DISABLE_CUDA"] = "1"

import threading
import time

import torch
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
from langdetect import detect
from huggingface_hub import hf_hub_download

# --- Paths ---
MODEL_PATH = './muril_combined_multilingual_model'
CSV_PATH = './muril_multilingual_dataset.csv'
HF_REPO = "Sp2503/muril-dataset"
HF_FILE = "answer_embeddings.pt"

# --- FastAPI Setup ---
app = FastAPI(title="MuRIL Multilingual QA API")

# Globals populated asynchronously by the background loader thread;
# endpoints must treat None as "still loading".
model = None
df = None
answer_embeddings = None


def load_embeddings():
    """Download the precomputed answer-embedding tensor from the HF dataset repo."""
    print("📥 Downloading embeddings from Hugging Face...")
    hf_path = hf_hub_download(
        repo_id=HF_REPO,
        filename=HF_FILE,
        repo_type="dataset",
        cache_dir="/tmp"
    )
    print(f"✅ Embeddings available at {hf_path}")
    return torch.load(hf_path, map_location="cpu")


def load_resources():
    """Load model, dataset and embeddings into the module globals.

    Runs in a daemon thread so the HTTP server becomes responsive immediately;
    failures are logged and the API keeps answering "still loading".
    """
    global model, df, answer_embeddings
    try:
        print("⚙️ Loading model and dataset...")
        model = SentenceTransformer(MODEL_PATH)
        df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])
        answer_embeddings = load_embeddings()
        print("✅ Model and embeddings ready.")
    except Exception as e:
        print(f"❌ Error loading resources: {e}")


# NOTE(review): on_event is deprecated in recent FastAPI in favor of lifespan
# handlers, but still works; kept for compatibility with the pinned version.
@app.on_event("startup")
def startup_event():
    """Kick off resource loading in the background at server startup."""
    print("🚀 Starting background model loader thread...")
    thread = threading.Thread(target=load_resources)
    thread.daemon = True
    thread.start()


@app.get("/")
def root():
    """Liveness probe; `model_loaded` reports whether the loader has finished."""
    ready = model is not None and df is not None and answer_embeddings is not None
    return {"status": "✅ Running MuRIL QA API", "model_loaded": ready}


class QueryRequest(BaseModel):
    """Incoming question; `lang` optionally restricts retrieval to one language."""
    question: str
    # `str | None` replaces the invalid `lang: str = None` annotation.
    lang: str | None = None


class QAResponse(BaseModel):
    """Single best-matching answer from the dataset."""
    answer: str


@app.post("/get-answer", response_model=QAResponse)
def get_answer_endpoint(request: QueryRequest):
    """Return the dataset answer whose precomputed embedding is most
    cosine-similar to the encoded question, filtered by the requested or
    auto-detected language."""
    if model is None or df is None or answer_embeddings is None:
        return {"answer": "⏳ Model still loading, please try again shortly."}

    question_text = request.question.strip()
    lang_filter = request.lang
    if not lang_filter:
        try:
            lang_filter = detect(question_text)
        except Exception:
            # langdetect raises LangDetectException for empty or
            # feature-less text; previously this escaped as an HTTP 500.
            # Fall back to searching across all languages instead.
            lang_filter = None

    filtered_df = df
    filtered_embeddings = answer_embeddings
    if 'lang' in df.columns and lang_filter:
        # mask.values is a positional boolean array matching the row order
        # used when the embeddings were precomputed, so tensor indexing and
        # the reset-index dataframe stay aligned.
        mask = df['lang'] == lang_filter
        filtered_df = df[mask].reset_index(drop=True)
        filtered_embeddings = answer_embeddings[mask.values]

    if len(filtered_df) == 0:
        return {"answer": f"⚠️ No data found for language '{lang_filter}'."}

    question_emb = model.encode(question_text, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(question_emb, filtered_embeddings)
    best_idx = torch.argmax(cosine_scores).item()
    answer = filtered_df.iloc[best_idx]['answer']
    return {"answer": answer}


def keep_alive():
    """Idle heartbeat loop to keep the Spaces container from going dormant."""
    while True:
        time.sleep(60)


if __name__ == "__main__":
    import uvicorn
    threading.Thread(target=keep_alive, daemon=True).start()
    uvicorn.run("main:app", host="0.0.0.0", port=8080)
Muril-Model/muril_combined_multilingual_model/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
Muril-Model/muril_combined_multilingual_model/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.56.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 197285
25
+ }
Muril-Model/muril_combined_multilingual_model/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "SentenceTransformer",
3
+ "__version__": {
4
+ "sentence_transformers": "5.1.1",
5
+ "transformers": "4.56.2",
6
+ "pytorch": "2.8.0+cu126"
7
+ },
8
+ "prompts": {
9
+ "query": "",
10
+ "document": ""
11
+ },
12
+ "default_prompt_name": null,
13
+ "similarity_fn_name": "cosine"
14
+ }
Muril-Model/muril_combined_multilingual_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efc762382be48593e4d42fd7ddb7ba61a013d7447bc027b5d4f345d42bf8427a
3
+ size 950247272
Muril-Model/muril_combined_multilingual_model/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
Muril-Model/muril_combined_multilingual_model/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
Muril-Model/muril_combined_multilingual_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
Muril-Model/muril_combined_multilingual_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Muril-Model/muril_combined_multilingual_model/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "extra_special_tokens": {},
49
+ "lowercase": false,
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "never_split": null,
53
+ "pad_token": "[PAD]",
54
+ "sep_token": "[SEP]",
55
+ "strip_accents": false,
56
+ "tokenize_chinese_chars": true,
57
+ "tokenizer_class": "BertTokenizer",
58
+ "unk_token": "[UNK]"
59
+ }
Muril-Model/muril_combined_multilingual_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
Muril-Model/muril_multilingual_dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cca0ed98f57664e558825059722272d15fe99f5238969e95f523629fb50cec
3
+ size 16996056
Muril-Model/precompute_embeddings.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build and persist the answer-embedding tensor used by the QA API."""
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer

# Locations of the fine-tuned model, the QA dataset and the output tensor.
MODEL_PATH = './muril_combined_multilingual_model'
CSV_PATH = './muril_multilingual_dataset.csv'
EMB_PATH = './answer_embeddings.pt'

print("🔄 Precomputing embeddings...")
st_model = SentenceTransformer(MODEL_PATH)
# Rows missing a question or answer are dropped so the saved tensor stays
# row-aligned with the dataframe the serving code filters on.
qa_frame = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])

answers = qa_frame['answer'].tolist()
torch.save(st_model.encode(answers, convert_to_tensor=True), EMB_PATH)

print(f"✅ Saved {len(answers)} embeddings to {EMB_PATH}")
Muril-Model/requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.118.0
2
+ uvicorn==0.37.0
3
+ torch==2.1.0
4
+ sentence-transformers==5.1.1
5
+ transformers==4.43.3
6
+ numpy<2
7
+ pandas==2.1.1
8
+ langdetect==1.0.9
9
+ requests==2.31.0
10
+ tqdm==4.65.0
11
+ PyMuPDF==1.23.0
12
+ huggingface_hub==0.23.4
New Text Document.txt ADDED
File without changes
__pycache__/main.cpython-312.pyc ADDED
Binary file (4.15 kB). View file
 
murilmodeltraining.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
pdfs/constitution_1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:819ba7adc5e5ae4063f26c82472283ba682ddb662e5a7f864cf26984d99b26ed
3
+ size 2413611
pdfs/constitution_10.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:368194dab2e116625e62c9f9f6164d9da142fd4a255ec92fcd93e6f4bfcd71c9
3
+ size 16375014
pdfs/constitution_11.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:690b780cf35413e92949ad48c92f9c55241623adf9fcddc6cc31c8ef2ca68a38
3
+ size 6510505
pdfs/constitution_12.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3e78b81b1551aa11922dd56a128947e483347870518fb40323a20db55b48d0e
3
+ size 30214317
pdfs/constitution_2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7646d0ad9ef704dd48adddd84f1c1649c297a9233e2e3a4aa21d6e470c0303b7
3
+ size 5499505
pdfs/constitution_3.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdd995f5349c7f32a544a0ae85b389820a0475b9558ea4c1e5f0a2b0a640c39d
3
+ size 8559896
pdfs/constitution_4.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dae1ef1ad3c201563563b68152b685b2b8ca0a307a75a53e818f1bb81488675
3
+ size 6685693
pdfs/constitution_5.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2409ce877cd34104df1831aa241ab6077ca0c5845451fbf68aa85693850cdca
3
+ size 74393982