Spaces:
Runtime error
Runtime error
Ezhil
committed on
Commit
·
1aa4489
0
Parent(s):
Initial commit
Browse files- Dockerfile +20 -0
- README.md +8 -0
- data/sms_process_data_main.xlsx +0 -0
- main.py +34 -0
- models/sms_classifier_model.pkl +0 -0
- models/tfidf_vectorizer.pkl +0 -0
- requirements.txt +8 -0
- schemas/input_schemas.py +23 -0
- services/sms_service.py +58 -0
- services/train_model.py +42 -0
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use official Python image as a base image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire app folder into the container
COPY . .

# Expose the port the app runs on
EXPOSE 7860

# Command to run the FastAPI app using Uvicorn.
# main.py sits at the repository root and is copied to /app, so the ASGI
# module path is "main:app" — "app.main:app" would fail at startup with
# ModuleNotFoundError (there is no "app" package in this repo).
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Embedding Restapi
|
| 3 |
+
emoji: 😻
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
data/sms_process_data_main.xlsx
ADDED
|
Binary file (42.2 kB). View file
|
|
|
main.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from services.sms_service import predict_label, compute_cosine_similarity, compute_embeddings
|
| 3 |
+
from schemas.input_schemas import CosineSimilarityInput, MessageInput, EmbeddingInput
|
| 4 |
+
|
| 5 |
+
app = FastAPI()
|
| 6 |
+
|
| 7 |
+
# 🚀 1️⃣ Homepage Endpoint
@app.get("/")
async def home():
    """Root endpoint: return a static welcome payload for the API."""
    welcome = {"message": "Welcome to SMS Classification API"}
    return welcome
|
| 11 |
+
|
| 12 |
+
# 🔢 2️⃣ Cosine Similarity Endpoint
@app.post("/cosine_similarity")
async def get_cosine_similarity(input_data: CosineSimilarityInput):
    """Compute TF-IDF cosine similarity between two texts.

    Delegates to the service layer and returns its response.

    Raises:
        HTTPException: service-raised HTTPExceptions are propagated
            unchanged; any other failure becomes a generic 500.
    """
    try:
        return await compute_cosine_similarity(input_data.text1, input_data.text2)
    except HTTPException:
        # The service layer already raised a specific HTTPException;
        # re-wrapping it would bury its status code and detail.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error computing similarity: {str(e)}")
|
| 19 |
+
|
| 20 |
+
# 📩 3️⃣ SMS Classification Endpoint
@app.post("/predict_label")
async def classify_message(input_data: MessageInput):
    """Classify an SMS message via the persisted classifier.

    Raises:
        HTTPException: service-raised HTTPExceptions are propagated
            unchanged; any other failure becomes a generic 500.
    """
    try:
        return await predict_label(input_data.message)
    except HTTPException:
        # Preserve the service layer's original status code and detail.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error predicting label: {str(e)}")
|
| 27 |
+
|
| 28 |
+
# 📊 4️⃣ Text Embedding Endpoint
@app.post("/compute_embeddings")
async def get_embeddings(input_data: EmbeddingInput):
    """Return the TF-IDF vector for a message.

    Raises:
        HTTPException: service-raised HTTPExceptions are propagated
            unchanged; any other failure becomes a generic 500.
    """
    try:
        return await compute_embeddings(input_data.message)
    except HTTPException:
        # Preserve the service layer's original status code and detail.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error computing embeddings: {str(e)}")
|
models/sms_classifier_model.pkl
ADDED
|
Binary file (21.3 kB). View file
|
|
|
models/tfidf_vectorizer.pkl
ADDED
|
Binary file (93.6 kB). View file
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
sentence-transformers
|
| 4 |
+
scikit-learn
|
| 5 |
+
pandas
|
| 6 |
+
numpy
|
| 7 |
+
openpyxl # Needed for reading Excel files
|
| 8 |
+
gradio
|
schemas/input_schemas.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
# Request schema for computing cosine similarity
class CosineSimilarityInput(BaseModel):
    """Request body for POST /cosine_similarity: the two texts to compare."""
    text1: str
    text2: str
|
| 8 |
+
|
| 9 |
+
# Request schema for SMS classification
class MessageInput(BaseModel):
    """Request body for POST /predict_label: the SMS text to classify."""
    message: str
|
| 12 |
+
|
| 13 |
+
# Request schema for computing embeddings
class EmbeddingInput(BaseModel):
    """Request body for POST /compute_embeddings: the text to vectorize."""
    message: str
|
| 16 |
+
|
| 17 |
+
# Response schema for cosine similarity
class CosineSimilarityResponse(BaseModel):
    """Response for POST /cosine_similarity: a single similarity score."""
    cosine_similarity: float
|
| 20 |
+
|
| 21 |
+
# Response schema for embeddings
class EmbeddingResponse(BaseModel):
    """Response for POST /compute_embeddings: one vector per input message."""
    embeddings: List[List[float]]
|
services/sms_service.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
from schemas.input_schemas import CosineSimilarityResponse, EmbeddingResponse
|
| 7 |
+
|
| 8 |
+
# Load the trained model and vectorizer
def load_model():
    """Return the (classifier, vectorizer) pair, deserializing them once.

    The unpickled artifacts are memoized on the function object, so each
    request does not re-read and re-unpickle both files from disk.

    Returns:
        tuple: (classifier, vectorizer) as saved by services/train_model.py.

    Raises:
        HTTPException: 500 if either pickle file is missing or unreadable.
    """
    cached = getattr(load_model, "_cache", None)
    if cached is not None:
        return cached

    model_path = "models/sms_classifier_model.pkl"
    vectorizer_path = "models/tfidf_vectorizer.pkl"

    try:
        with open(model_path, 'rb') as f:
            classifier = pickle.load(f)

        with open(vectorizer_path, 'rb') as f:
            vectorizer = pickle.load(f)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading model: {str(e)}")

    # Only cache after both loads succeed, so a transient failure
    # does not pin a half-initialized pair.
    load_model._cache = (classifier, vectorizer)
    return load_model._cache
|
| 23 |
+
|
| 24 |
+
async def predict_label(message: str):
    """Classify an SMS message with the persisted TF-IDF + classifier pair.

    Args:
        message: raw SMS text.

    Returns:
        dict: {"label": <predicted label>}.

    Raises:
        HTTPException: propagated from load_model(), or 500 on any
            vectorization/prediction failure.
    """
    try:
        classifier, vectorizer = load_model()
        # Vectorize the input message
        message_vec = vectorizer.transform([message])

        # Predict the label
        label = classifier.predict(message_vec)[0]
        return {"label": label}
    except HTTPException:
        # load_model() already raised a specific HTTPException; don't
        # re-wrap it as "Error predicting label: ...".
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error predicting label: {str(e)}")
|
| 35 |
+
|
| 36 |
+
async def compute_cosine_similarity(text1: str, text2: str):
    """Compute cosine similarity between two texts in TF-IDF space.

    Args:
        text1: first text.
        text2: second text.

    Returns:
        CosineSimilarityResponse with the similarity in [0, 1]
        (TF-IDF vectors are non-negative).

    Raises:
        HTTPException: propagated from load_model(), or 500 on any
            vectorization failure.
    """
    try:
        # The classifier is not needed for similarity; only the vectorizer.
        _, vectorizer = load_model()

        # Vectorize the input texts
        vec1 = vectorizer.transform([text1]).toarray()
        vec2 = vectorizer.transform([text2]).toarray()

        # Guard against division by zero: a text made entirely of
        # out-of-vocabulary tokens vectorizes to the zero vector, and the
        # original expression returned NaN in that case.
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return CosineSimilarityResponse(cosine_similarity=0.0)

        # Compute cosine similarity
        cosine_sim = np.dot(vec1, vec2.T) / norm_product
        return CosineSimilarityResponse(cosine_similarity=float(cosine_sim[0][0]))
    except HTTPException:
        # Preserve the specific HTTPException raised by load_model().
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error computing similarity: {str(e)}")
|
| 49 |
+
|
| 50 |
+
async def compute_embeddings(message: str):
    """Return the TF-IDF vector for a message as a nested list.

    Note: despite the name, this is a sparse TF-IDF projection, not a
    sentence-transformer embedding (the SentenceTransformer import in this
    module is unused).

    Args:
        message: raw SMS text.

    Returns:
        EmbeddingResponse with a single row vector.

    Raises:
        HTTPException: propagated from load_model(), or 500 on any
            vectorization failure.
    """
    try:
        # The classifier is not needed here; only the vectorizer.
        _, vectorizer = load_model()

        # Vectorize the input message
        embedding = vectorizer.transform([message]).toarray().tolist()
        return EmbeddingResponse(embeddings=embedding)
    except HTTPException:
        # Preserve the specific HTTPException raised by load_model().
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error computing embeddings: {str(e)}")
|
services/train_model.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Train and persist the SMS classifier (TF-IDF + logistic regression).

Reads data/sms_process_data_main.xlsx, fits the vectorizer/classifier on a
train split, reports held-out accuracy, and pickles both artifacts under
models/ for services/sms_service.py to load.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import os


def main():
    # Load the dataset
    file_path = "data/sms_process_data_main.xlsx"
    df = pd.read_excel(file_path)

    # Prepare the features and labels
    X = df['MessageText']  # SMS messages
    y = df['label']        # Labels: 'Transaction' or 'Offer'

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the TF-IDF vectorizer on the training data only (no test leakage)
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)

    # Initialize and train the logistic regression model
    classifier = LogisticRegression()
    classifier.fit(X_train_vec, y_train)

    # Evaluate on the held-out split — the original script created the
    # split but never used X_test/y_test at all.
    accuracy = classifier.score(vectorizer.transform(X_test), y_test)
    print(f"Held-out accuracy: {accuracy:.3f}")

    # Save the trained model and vectorizer
    models_dir = "models"
    os.makedirs(models_dir, exist_ok=True)

    # Save the classifier model
    with open(os.path.join(models_dir, 'sms_classifier_model.pkl'), 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # Save the vectorizer
    with open(os.path.join(models_dir, 'tfidf_vectorizer.pkl'), 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    print("Model and vectorizer saved successfully!")


# Guard so importing this module (e.g. for its helpers) does not retrain.
if __name__ == "__main__":
    main()
|