Sp2503 committed on
Commit
d9d3ffa
·
verified ·
1 Parent(s): fc35038

Delete Muril-Model

Browse files
Muril-Model/.gitattributes DELETED
@@ -1,36 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- muril_multilingual_dataset.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Muril-Model/.gitignore DELETED
@@ -1,45 +0,0 @@
1
- # Python
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
- *.so
6
- .Python
7
- build/
8
- develop-eggs/
9
- dist/
10
- downloads/
11
- eggs/
12
- .eggs/
13
- lib/
14
- lib64/
15
- parts/
16
- sdist/
17
- var/
18
- wheels/
19
- *.egg-info/
20
- .installed.cfg
21
- *.egg
22
-
23
- # Virtual Environment
24
- venv/
25
- env/
26
- .env/
27
- .venv/
28
-
29
- # IDEs and Editors
30
- .idea/
31
- .vscode/
32
- *.swp
33
- *.swo
34
- *~
35
-
36
- # Jupyter Notebook
37
- .ipynb_checkpoints
38
-
39
- # Model files and data
40
- *.h5
41
- *.pkl
42
- *.model
43
- data/
44
- models/
45
- logs/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Muril-Model/Dockerfile DELETED
@@ -1,34 +0,0 @@
1
- # Use lightweight Python base image
2
- FROM python:3.10-slim
3
-
4
- # Disable CUDA & set cache locations
5
- ENV TORCH_DISABLE_CUDA=1
6
- ENV TRANSFORMERS_CACHE=/app/hf_cache
7
- ENV HF_HOME=/app/hf_cache
8
-
9
- # Working directory
10
- WORKDIR /app
11
-
12
- # Copy dependency list
13
- COPY requirements.txt .
14
-
15
- # Install dependencies efficiently
16
- RUN apt-get update && apt-get install -y git && \
17
- pip install --no-cache-dir -r requirements.txt && \
18
- rm -rf /var/lib/apt/lists/*
19
-
20
- # Copy app code
21
- COPY . .
22
-
23
- # Create cache folder (writable)
24
- RUN mkdir -p /app/hf_cache && chmod -R 777 /app/hf_cache
25
-
26
- # Expose port for FastAPI
27
- EXPOSE 8080
28
-
29
- # Health check (optional for Cloud Run / Spaces)
30
- HEALTHCHECK CMD curl --fail http://localhost:8080/ || exit 1
31
-
32
- # Run the FastAPI app
33
- # Using "app:app" ensures fast startup since app.py imports main.py lazily.
34
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Muril-Model/README.md DELETED
@@ -1,10 +0,0 @@
1
- ---
2
- title: Muril Model
3
- emoji: 🌖
4
- colorFrom: gray
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
Muril-Model/answer_embeddings.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fa2e749567247c8a15144c6a0b1d3423ae8a8a0054aee9f3cc2774f8b9cb555
3
- size 83854959
 
 
 
 
Muril-Model/app.py DELETED
@@ -1,4 +0,0 @@
1
- # app.py
2
- import importlib
3
-
4
- app = importlib.import_module("main").app
 
 
 
 
 
Muril-Model/main.py DELETED
@@ -1,107 +0,0 @@
1
- import os
2
- import torch
3
- import pandas as pd
4
- from fastapi import FastAPI
5
- from pydantic import BaseModel
6
- from sentence_transformers import SentenceTransformer, util
7
- from langdetect import detect
8
- from huggingface_hub import hf_hub_download
9
- import threading
10
- import time
11
-
12
- # --- Cache Configuration ---
13
- os.environ["HF_HOME"] = "/app/hf_cache"
14
- os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
15
- os.environ["TORCH_DISABLE_CUDA"] = "1"
16
-
17
- # --- Paths ---
18
- MODEL_PATH = './muril_combined_multilingual_model'
19
- CSV_PATH = './muril_multilingual_dataset.csv'
20
- HF_REPO = "Sp2503/muril-dataset"
21
- HF_FILE = "answer_embeddings.pt"
22
-
23
- # --- FastAPI Setup ---
24
- app = FastAPI(title="MuRIL Multilingual QA API")
25
-
26
- # Global variables
27
- model = None
28
- df = None
29
- answer_embeddings = None
30
-
31
- # --- Helper: Load embeddings from Hugging Face ---
32
- def load_embeddings():
33
- print("📥 Downloading embeddings from Hugging Face...")
34
- hf_path = hf_hub_download(
35
- repo_id=HF_REPO,
36
- filename=HF_FILE,
37
- repo_type="dataset",
38
- cache_dir="/tmp"
39
- )
40
- print(f"✅ Embeddings available at {hf_path}")
41
- return torch.load(hf_path, map_location="cpu")
42
-
43
- # --- Resource Loader ---
44
- def load_resources():
45
- global model, df, answer_embeddings
46
- try:
47
- print("⚙️ Loading model and dataset...")
48
- model = SentenceTransformer(MODEL_PATH)
49
- df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])
50
- answer_embeddings = load_embeddings()
51
- print("✅ Model and embeddings ready.")
52
- except Exception as e:
53
- print(f"❌ Error loading resources: {e}")
54
-
55
- # --- Background Loader Thread ---
56
- @app.on_event("startup")
57
- def startup_event():
58
- print("🚀 Starting background model loader thread...")
59
- thread = threading.Thread(target=load_resources)
60
- thread.daemon = True
61
- thread.start()
62
-
63
- @app.get("/")
64
- def root():
65
- ready = model is not None and df is not None and answer_embeddings is not None
66
- return {"status": "✅ Running MuRIL QA API", "model_loaded": ready}
67
-
68
- class QueryRequest(BaseModel):
69
- question: str
70
- lang: str = None
71
-
72
- class QAResponse(BaseModel):
73
- answer: str
74
-
75
- @app.post("/get-answer", response_model=QAResponse)
76
- def get_answer_endpoint(request: QueryRequest):
77
- if model is None or df is None or answer_embeddings is None:
78
- return {"answer": "⏳ Model still loading, please try again shortly."}
79
-
80
- question_text = request.question.strip()
81
- lang_filter = request.lang or detect(question_text)
82
-
83
- filtered_df = df
84
- filtered_embeddings = answer_embeddings
85
- if 'lang' in df.columns and lang_filter:
86
- mask = df['lang'] == lang_filter
87
- filtered_df = df[mask].reset_index(drop=True)
88
- filtered_embeddings = answer_embeddings[mask.values]
89
-
90
- if len(filtered_df) == 0:
91
- return {"answer": f"⚠️ No data found for language '{lang_filter}'."}
92
-
93
- question_emb = model.encode(question_text, convert_to_tensor=True)
94
- cosine_scores = util.pytorch_cos_sim(question_emb, filtered_embeddings)
95
- best_idx = torch.argmax(cosine_scores).item()
96
- answer = filtered_df.iloc[best_idx]['answer']
97
- return {"answer": answer}
98
-
99
- # --- Keep-alive thread for Spaces ---
100
- def keep_alive():
101
- while True:
102
- time.sleep(60)
103
-
104
- if __name__ == "__main__":
105
- import uvicorn
106
- threading.Thread(target=keep_alive, daemon=True).start()
107
- uvicorn.run("main:app", host="0.0.0.0", port=8080)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Muril-Model/muril_multilingual_dataset.csv DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:09cca0ed98f57664e558825059722272d15fe99f5238969e95f523629fb50cec
3
- size 16996056
 
 
 
 
Muril-Model/precompute_embeddings.py DELETED
@@ -1,17 +0,0 @@
1
- import torch
2
- import pandas as pd
3
- from sentence_transformers import SentenceTransformer
4
-
5
- MODEL_PATH = './muril_combined_multilingual_model'
6
- CSV_PATH = './muril_multilingual_dataset.csv'
7
- EMB_PATH = './answer_embeddings.pt'
8
-
9
- print("🔄 Precomputing embeddings...")
10
- model = SentenceTransformer(MODEL_PATH)
11
- df = pd.read_csv(CSV_PATH).dropna(subset=['question', 'answer'])
12
-
13
- answers = df['answer'].tolist()
14
- answer_embeddings = model.encode(answers, convert_to_tensor=True)
15
- torch.save(answer_embeddings, EMB_PATH)
16
-
17
- print(f"✅ Saved {len(answers)} embeddings to {EMB_PATH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Muril-Model/requirements.txt DELETED
@@ -1,12 +0,0 @@
1
- fastapi==0.118.0
2
- uvicorn==0.37.0
3
- torch==2.1.0
4
- sentence-transformers==5.1.1
5
- transformers==4.43.3
6
- numpy<2
7
- pandas==2.1.1
8
- langdetect==1.0.9
9
- requests==2.31.0
10
- tqdm==4.65.0
11
- PyMuPDF==1.23.0
12
- huggingface_hub==0.23.4