Spaces:

subbunanepalli
/

spacee

Build error

App Files Community

subbunanepalli committed on Jun 7, 2025

Commit

c44a5b8

verified ·

1 Parent(s): 4e668dc

Upload 6 files

Browse files

Files changed (5) hide show

main.py +82 -39
requirements.txt +5 -3
special_tokens_map.json +1 -0
tokenizer_config.json +1 -0
vocab.txt +8 -0

main.py CHANGED Viewed

@@ -1,52 +1,95 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException
-import pickle
-import pandas as pd
-from pydantic import BaseModel
-from typing import List
 import os
-app = FastAPI()
-MODEL_PATH = "tfidf_models.pkl"
-class TrainRequest(BaseModel):
-    texts: List[str]
-    labels: List[List[int]]
-class PredictRequest(BaseModel):
-    texts: List[str]
-@app.post("/train")
-def train_model(request: TrainRequest):
-    from sklearn.feature_extraction.text import TfidfVectorizer
-    from sklearn.multioutput import MultiOutputClassifier
-    from sklearn.linear_model import LogisticRegression
-    if len(request.texts) != len(request.labels):
-        raise HTTPException(status_code=400, detail="Texts and labels length mismatch")
-    X = request.texts
-    y = pd.DataFrame(request.labels)
-    vectorizer = TfidfVectorizer()
-    X_tfidf = vectorizer.fit_transform(X)
-    classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))
-    classifier.fit(X_tfidf, y)
-    with open(MODEL_PATH, "wb") as f:
-        pickle.dump((vectorizer, classifier), f)
-    return {"message": "Model trained and saved successfully."}
 @app.post("/predict")
 def predict(request: PredictRequest):
-    if not os.path.exists(MODEL_PATH):
-        raise HTTPException(status_code=404, detail="Model not found. Train the model first.")
-    with open(MODEL_PATH, "rb") as f:
-        vectorizer, classifier = pickle.load(f)
-    X_tfidf = vectorizer.transform(request.texts)
-    predictions = classifier.predict(X_tfidf)
-    return {"predictions": predictions.tolist()}

 import os
+import requests
+import torch
+import torch.nn as nn
+from transformers import BertTokenizer, BertModel
+from fastapi import FastAPI
+from pydantic import BaseModel
+# Constants
+# Names of the output fields. predict() pairs them positionally with the
+# model's classifier heads and uses them as keys into the label encoders.
+LABEL_COLUMNS = [
+    'Red_Flag_Reason', 'Maker_Action', 'Escalation_Level',
+    'Risk_Category', 'Risk_Drivers', 'Investigation_Outcome'
+]
+# Model identifier passed to BertModel.from_pretrained for the encoder.
+PRETRAINED_MODEL_NAME = 'bert-base-uncased'
+# Token length every request is truncated/padded to before inference.
+MAX_LEN = 128
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Local path the checkpoint is downloaded to and later loaded from.
+MODEL_PATH = "/tmp/bert_model.pth"
+# Google Drive file id handed to download_from_google_drive.
+FILE_ID = "1qqmBxbxM0CmxPGC4sqO6vLJAe-Kikiv4"
+# Google Drive download logic
+def download_from_google_drive(file_id, dest_path, timeout=60):
+    """Download a Google Drive file to ``dest_path``.
+
+    The payload is streamed to ``<dest_path>.part`` and only moved onto
+    ``dest_path`` once the transfer has finished, so an interrupted download
+    never leaves a truncated file that a later existence check would accept.
+
+    Args:
+        file_id: Google Drive id of the file to fetch.
+        dest_path: Local path the downloaded file is written to.
+        timeout: Seconds to wait for the server to connect/respond.
+
+    Raises:
+        requests.HTTPError: The server answered with an error status.
+        RuntimeError: Drive returned an HTML page instead of the file.
+    """
+    url = "https://docs.google.com/uc?export=download"
+    partial_path = f"{dest_path}.part"
+    def get_confirm_token(response):
+        # Drive sets a "download_warning*" cookie when it wants a confirmation.
+        for key, value in response.cookies.items():
+            if key.startswith('download_warning'):
+                return value
+        return None
+    with requests.Session() as session:
+        response = session.get(
+            url, params={'id': file_id}, stream=True, timeout=timeout
+        )
+        token = get_confirm_token(response)
+        if token:
+            # Release the first connection before retrying with the token.
+            response.close()
+            params = {'id': file_id, 'confirm': token}
+            response = session.get(
+                url, params=params, stream=True, timeout=timeout
+            )
+        try:
+            response.raise_for_status()
+            # An HTML body is an interstitial/error page, not the binary file;
+            # saving it would only surface later as an obscure load failure.
+            if response.headers.get('Content-Type', '').startswith('text/html'):
+                raise RuntimeError(
+                    "Google Drive returned an HTML page instead of the file; "
+                    "check the file id and its sharing settings."
+                )
+            with open(partial_path, "wb") as f:
+                for chunk in response.iter_content(32768):
+                    if chunk:
+                        f.write(chunk)
+            os.replace(partial_path, dest_path)
+        finally:
+            response.close()
+            # Only present here when the download failed part-way.
+            if os.path.exists(partial_path):
+                os.remove(partial_path)
+# Runs at import time: fetch the checkpoint unless a file already exists at
+# MODEL_PATH (the check is existence-only; the file's contents are not verified).
+if not os.path.exists(MODEL_PATH):
+    print("Downloading model from Google Drive...")
+    download_from_google_drive(FILE_ID, MODEL_PATH)
+# Model Definition
+class BertMultiOutput(nn.Module):
+    """BERT encoder followed by one linear classification head per output.
+
+    Args:
+        num_labels_per_output: Iterable of class counts; one ``nn.Linear``
+            head is created for each entry, in iteration order.
+    """
+    def __init__(self, num_labels_per_output):
+        super().__init__()
+        self.bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
+        self.dropout = nn.Dropout(0.3)
+        hidden_size = self.bert.config.hidden_size
+        heads = [
+            nn.Linear(hidden_size, num_classes)
+            for num_classes in num_labels_per_output
+        ]
+        self.classifiers = nn.ModuleList(heads)
+    def forward(self, input_ids, attention_mask):
+        """Return a list with one logits tensor per classification head."""
+        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        # Every head reads the same dropout-regularised pooled output.
+        features = self.dropout(encoded.pooler_output)
+        return [head(features) for head in self.classifiers]
+# Load model and tokenizer
+# The checkpoint carries label-encoder objects (they expose `classes_` and
+# `inverse_transform`) next to the weights. PyTorch 2.6+ defaults torch.load
+# to weights_only=True, which refuses such objects, so full unpickling is
+# requested explicitly.
+# SECURITY: full unpickling can execute arbitrary code; only load checkpoints
+# from a source you trust.
+checkpoint = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
+label_encoders = checkpoint['label_encoders']
+# One classifier head per encoder, sized by its number of classes, in the
+# dict's iteration order.
+num_labels_list = [len(le.classes_) for le in label_encoders.values()]
+model = BertMultiOutput(num_labels_list).to(DEVICE)
+model.load_state_dict(checkpoint['model_state_dict'])
+# Inference only: switch off dropout.
+model.eval()
+# NOTE(review): expects the tokenizer files in a local "bert_tokenizer/"
+# directory — confirm that directory exists in the deployed repository.
+tokenizer = BertTokenizer.from_pretrained("bert_tokenizer/")
+# FastAPI app
+# Application object the route decorators below register handlers on.
+app = FastAPI()
+# Request body for POST /predict.
+class PredictRequest(BaseModel):
+    # Raw text to classify; read by predict() as request.text.
+    text: str
+@app.get("/")
+def root():
+    """Return a static readiness message for GET /."""
+    status = {"message": "Multi-output BERT is ready!"}
+    return status
 @app.post("/predict")
 def predict(request: PredictRequest):
+    """Classify ``request.text`` and return one decoded label per output.
+
+    Args:
+        request: Body carrying the text to classify.
+
+    Returns:
+        ``{"predictions": {...}}`` with one entry per name in LABEL_COLUMNS,
+        each value decoded through the matching label encoder.
+    """
+    inputs = tokenizer(
+        request.text,
+        truncation=True,
+        padding='max_length',
+        max_length=MAX_LEN,
+        return_tensors="pt"
+    ).to(DEVICE)
+    with torch.no_grad():
+        # Pass only what BertMultiOutput.forward accepts. BertTokenizer also
+        # returns token_type_ids by default, so model(**inputs) raised
+        # TypeError (unexpected keyword argument) on every request.
+        outputs = model(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+        )
+        # Batch size is 1, so each head's argmax is a single class index.
+        preds = [torch.argmax(output, dim=1).item() for output in outputs]
+    # Heads are matched to label names by position.
+    decoded = {
+        label: label_encoders[label].inverse_transform([pred])[0]
+        for label, pred in zip(LABEL_COLUMNS, preds)
+    }
+    return {"predictions": decoded}

requirements.txt CHANGED Viewed

@@ -1,5 +1,7 @@
 fastapi
 uvicorn
-scikit-learn
-pandas
-python-multipart

 fastapi
 uvicorn
+transformers
+torch
+pydantic
+requests
+scikit-learn

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"cls_token": "[CLS]", "sep_token": "[SEP]"}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": true, "model_max_length": 512}

vocab.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+[PAD]
+[UNK]
+[CLS]
+[SEP]
+[MASK]
+a
+b
+c