Kabila22 committed on
Commit
9cdc952
·
1 Parent(s): bf426c4

Added trained model

Browse files
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Image for the embedding / similarity / SMS-classification FastAPI service.
FROM python:3.9

WORKDIR /app

# Point the Hugging Face libraries at a cache directory inside /app.
ENV HF_HOME=/app/.cache

# Create the cache directory and make it world-writable (presumably so the
# service can download models even when it runs as a non-root user).
# A single recursive chmod on /app/.cache already covers every subdirectory.
RUN mkdir -p /app/.cache/huggingface/hub && \
    chmod -R 777 /app/.cache

# Install dependencies before copying the rest of the sources so this layer
# stays cached until the requirements change.
# NOTE(review): the requirements file lives at app/requirements.txt and is
# empty in this commit; it must list the packages app/main.py imports
# (fastapi, uvicorn, sentence-transformers, scikit-learn, pandas, ...).
COPY app/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --upgrade -r /app/requirements.txt

# Copy the application code, the dataset and the trained model.
# No --chown here: the image never creates a "user" account, so
# `COPY --chown=user` would fail at build time.
COPY . /app

# Expose port
EXPOSE 7860

# Run the FastAPI app
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/__pycache__/main.cpython-313.pyc ADDED
Binary file (6.13 kB). View file
 
app/main.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from sentence_transformers import SentenceTransformer, util
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.metrics import accuracy_score, classification_report
9
+ import pickle
10
+ import os
11
+
# FastAPI application object; the route decorators below register on it.
app = FastAPI()

# Sentence-embedding models, loaded once when the module is imported.
# trust_remote_code=True allows the GTE model repository to run its own
# Python code while loading.
embedding_model = SentenceTransformer(
    model_name_or_path="Alibaba-NLP/gte-base-en-v1.5",
    trust_remote_code=True,
)
similarity_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
18
+
# Request body schemas
class TextInput(BaseModel):
    """Request body holding a single piece of text."""

    # Required string field.
    text: str
22
+
class SimilarityInput(BaseModel):
    """Request body holding the two texts to compare."""

    # Both fields are required strings.
    text1: str
    text2: str
26
+
class SMSInput(BaseModel):
    """Request body holding one SMS message."""

    # Required string field.
    sms: str
29
+
# Load the labelled SMS dataset; the 'MessageText' and 'label' columns are
# the ones used below.
file_name = r"data/sms_process_data_main.xlsx"
sheet = "Sheet1"
df = pd.read_excel(file_name, sheet_name=sheet)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['MessageText'], df['label'], test_size=0.2, random_state=42
)

# Train the classifier, or load it if a pickled one already exists.
model_path = "models/gte-base-en-v1.5.pickle"
if os.path.exists(model_path):
    # NOTE: pickle.load can execute arbitrary code from the file, so only
    # ship/load model artifacts that come from a trusted source.
    with open(model_path, 'rb') as f:
        classifier = pickle.load(f)
else:
    # Train a logistic regression model on the sentence embeddings.
    X_train_embeddings = embedding_model.encode(X_train.tolist(), convert_to_tensor=True).cpu().numpy()
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train_embeddings, y_train)

    # Make sure the target directory exists before writing; otherwise a
    # fresh checkout without models/ fails with FileNotFoundError.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(classifier, f)

# Evaluate on the held-out split and print the metrics. This runs on every
# import of the module, i.e. on every server start.
X_test_embeddings = embedding_model.encode(X_test.tolist(), convert_to_tensor=True).cpu().numpy()
y_pred = classifier.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
60
+
# Home route
@app.get("/")
async def home():
    """Return a static welcome message that points clients at /docs."""
    welcome = "Welcome to the embedding, similarity, and SMS classification API. Use /docs to test the endpoints."
    return {"message": welcome}
65
+
# Embedding endpoint
@app.post("/embed")
async def generate_embedding(text_input: TextInput):
    """Embed the submitted text and return the vector rounded to 2 decimals.

    Returns a dict with the vector length under "dimensions" and the rounded
    values under "embeddings". Any exception raised while encoding is
    reported to the client as an HTTP 500 carrying the exception text.
    """
    try:
        tensor = embedding_model.encode(text_input.text, convert_to_tensor=True)
        vector = tensor.cpu().numpy()
        rounded = np.round(vector, decimals=2).tolist()
        return {"dimensions": len(rounded), "embeddings": rounded}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
76
+
# Cosine-similarity endpoint
@app.post("/similarity")
async def calculate_similarity(similarity_input: SimilarityInput):
    """Return the cosine similarity of the two submitted texts.

    Both texts are encoded separately with ``similarity_model``; the score is
    rounded to 4 decimals and echoed back together with the inputs. Any
    exception is reported to the client as an HTTP 500 carrying the
    exception text.
    """
    try:
        first, second = (
            similarity_model.encode(text, convert_to_tensor=True)
            for text in (similarity_input.text1, similarity_input.text2)
        )
        score = util.cos_sim(first, second).item()
        return {
            "text1": similarity_input.text1,
            "text2": similarity_input.text2,
            "cosine_similarity": round(score, 4),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
87
+
# Endpoint for SMS classification
@app.post("/classify_sms")
async def classify_sms(sms_input: SMSInput):
    """Classify one SMS with the logistic-regression model.

    The message is embedded with ``embedding_model`` and the resulting vector
    is passed to ``classifier.predict``. Returns a dict echoing the SMS under
    "sms" and the predicted label under "classification". Any exception is
    reported to the client as an HTTP 500 carrying the exception text.
    """
    try:
        # Encode the input SMS
        sms_embedding = embedding_model.encode(sms_input.sms, convert_to_tensor=True).cpu().numpy()

        # The classifier expects a 2-D array: one row per sample.
        prediction = classifier.predict(sms_embedding.reshape(1, -1))

        # predict() can hand back NumPy scalars (e.g. numpy.int64), which the
        # JSON response encoder cannot serialise; convert them to the
        # equivalent built-in Python value first.
        label = prediction[0]
        if isinstance(label, np.generic):
            label = label.item()

        return {"sms": sms_input.sms, "classification": label}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
102
+
# Start a uvicorn server when this file is executed directly.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app=app, port=7860, host="0.0.0.0")
app/requirements.txt ADDED
File without changes
data/sms_process_data_main.xlsx ADDED
Binary file (46.8 kB). View file
 
models/gte-base-en-v1.5.pickle ADDED
Binary file (6.87 kB). View file
 
models/train_modle.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from sklearn.linear_model import LogisticRegression
3
+ from sklearn.model_selection import train_test_split
4
+ import pandas as pd
5
+ import pickle
6
+
# Read the labelled SMS sheet and keep only the training part of the
# 80/20 split (fixed seed, so the split is reproducible).
df = pd.read_excel("data/sms_process_data_main.xlsx", sheet_name="Sheet1")
X_train, _, y_train, _ = train_test_split(
    df['MessageText'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Sentence-embedding model used to turn each message into a vector.
embedding_model = SentenceTransformer(
    "Alibaba-NLP/gte-base-en-v1.5",
    trust_remote_code=True,
)

# Embed the training messages and convert the tensor to a NumPy array.
train_tensor = embedding_model.encode(X_train.tolist(), convert_to_tensor=True)
X_train_embeddings = train_tensor.cpu().numpy()

# Fit the classifier; fit() returns the estimator itself.
classifier = LogisticRegression(max_iter=1000).fit(X_train_embeddings, y_train)

# Persist the fitted classifier.
with open("models/gte-base-en-v1.5.pickle", "wb") as f:
    pickle.dump(classifier, f)

print("Model saved to gte-base-en-v1.5.pickle")