Added trained model

Files changed:

- Dockerfile +28 -0
- app/__pycache__/main.cpython-313.pyc +0 -0
- app/main.py +106 -0
- app/requirements.txt +0 -0
- data/sms_process_data_main.xlsx +0 -0
- models/gte-base-en-v1.5.pickle +0 -0
- models/train_modle.py +25 -0
Dockerfile
ADDED
@@ -0,0 +1,28 @@

```dockerfile
FROM python:3.9

WORKDIR /app
COPY . /app

ENV HF_HOME=/app/.cache

# Create cache directory with appropriate permissions
RUN mkdir -p /app/.cache/huggingface/hub && \
    chmod -R 777 /app/.cache && \
    chmod -R 777 /app/.cache/huggingface

# Install dependencies
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy requirements again (redundant with the COPY above; kept from the original structure)
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Expose port
EXPOSE 7860

# Run the FastAPI app
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
```
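Note that the image can be built and exercised locally with `docker build -t sms-classifier .` and `docker run -p 7860:7860 sms-classifier` (the image name is illustrative). Both `pip install -r requirements.txt` lines resolve against `/app`, while the file listing above only adds `app/requirements.txt`; the build therefore assumes a root-level `requirements.txt` already exists in the build context.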
app/__pycache__/main.cpython-313.pyc
ADDED
Binary file (6.13 kB)
app/main.py
ADDED
@@ -0,0 +1,106 @@

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

# Initialize FastAPI app
app = FastAPI()

# Load embedding models
embedding_model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")

# Define request body schemas
class TextInput(BaseModel):
    text: str

class SimilarityInput(BaseModel):
    text1: str
    text2: str

class SMSInput(BaseModel):
    sms: str

# Load dataset
file_name = r"data/sms_process_data_main.xlsx"
sheet = "Sheet1"
df = pd.read_excel(file_name, sheet_name=sheet)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['MessageText'], df['label'], test_size=0.2, random_state=42)

# Train or load the model
model_path = "models/gte-base-en-v1.5.pickle"
if os.path.exists(model_path):
    # Load pre-trained classifier from pickle
    with open(model_path, 'rb') as f:
        classifier = pickle.load(f)
else:
    # Train logistic regression model
    X_train_embeddings = embedding_model.encode(X_train.tolist(), convert_to_tensor=True).cpu().numpy()
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train_embeddings, y_train)

    # Save the trained model to pickle
    with open(model_path, 'wb') as f:
        pickle.dump(classifier, f)

# Evaluate model (optional, for logging purposes)
X_test_embeddings = embedding_model.encode(X_test.tolist(), convert_to_tensor=True).cpu().numpy()
y_pred = classifier.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Home route
@app.get("/")
async def home():
    return {"message": "Welcome to the embedding, similarity, and SMS classification API. Use /docs to test the endpoints."}

# Endpoint for generating embeddings
@app.post("/embed")
async def generate_embedding(text_input: TextInput):
    try:
        embedding = embedding_model.encode(text_input.text, convert_to_tensor=True).cpu().numpy()
        rounded_embedding = np.round(embedding, decimals=2).tolist()
        dimensions = len(rounded_embedding)
        return {"dimensions": dimensions, "embeddings": rounded_embedding}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Endpoint for calculating cosine similarity
@app.post("/similarity")
async def calculate_similarity(similarity_input: SimilarityInput):
    try:
        embeddings1 = similarity_model.encode(similarity_input.text1, convert_to_tensor=True)
        embeddings2 = similarity_model.encode(similarity_input.text2, convert_to_tensor=True)
        cosine_similarity = util.cos_sim(embeddings1, embeddings2).item()
        return {"text1": similarity_input.text1, "text2": similarity_input.text2, "cosine_similarity": round(cosine_similarity, 4)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Endpoint for SMS classification
@app.post("/classify_sms")
async def classify_sms(sms_input: SMSInput):
    try:
        # Encode the input SMS
        sms_embedding = embedding_model.encode(sms_input.sms, convert_to_tensor=True).cpu().numpy()

        # Predict the label using the trained model
        prediction = classifier.predict([sms_embedding])

        # Return the predicted label
        return {"sms": sms_input.sms, "classification": prediction[0]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run FastAPI app
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
```
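For quick testing outside of `/docs`, here is a minimal client sketch against the three endpoints defined above (the base URL, sample texts, and use of the `requests` library are assumptions, not part of the commit):

```python
import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment; adjust for the hosted Space

# /embed: returns the embedding width and the rounded vector
r = requests.post(f"{BASE_URL}/embed", json={"text": "Your package has shipped"})
print(r.json()["dimensions"])

# /similarity: cosine similarity between two texts, rounded to 4 decimals
r = requests.post(f"{BASE_URL}/similarity",
                  json={"text1": "Your package has shipped",
                        "text2": "Your order is on its way"})
print(r.json()["cosine_similarity"])

# /classify_sms: predicted label from the pickled logistic-regression classifier
r = requests.post(f"{BASE_URL}/classify_sms",
                  json={"sms": "Congratulations, you won a free prize!"})
print(r.json()["classification"])
```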
app/requirements.txt
ADDED
File without changes

data/sms_process_data_main.xlsx
ADDED
Binary file (46.8 kB)

models/gte-base-en-v1.5.pickle
ADDED
Binary file (6.87 kB)
models/train_modle.py
ADDED
@@ -0,0 +1,25 @@

```python
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle

# Load dataset
df = pd.read_excel("data/sms_process_data_main.xlsx", sheet_name="Sheet1")
X_train, _, y_train, _ = train_test_split(df['MessageText'], df['label'], test_size=0.2, random_state=42)

# Load embedding model
embedding_model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)

# Generate embeddings
X_train_embeddings = embedding_model.encode(X_train.tolist(), convert_to_tensor=True).cpu().numpy()

# Train model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_embeddings, y_train)

# Save to pickle
with open("models/gte-base-en-v1.5.pickle", "wb") as f:
    pickle.dump(classifier, f)

print("Model saved to gte-base-en-v1.5.pickle")
```
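A minimal smoke test for the saved artifact, assuming the same paths and encoder as the training script (the sample message is made up):

```python
import pickle

from sentence_transformers import SentenceTransformer

# Load the classifier pickled by train_modle.py
with open("models/gte-base-en-v1.5.pickle", "rb") as f:
    clf = pickle.load(f)

# Embed a sample SMS with the same encoder used at training time
encoder = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)
emb = encoder.encode(["Your verification code is 482913"])  # shape (1, dim)

print(clf.predict(emb)[0])  # predicted label
```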