velmurugan1122 committed on
Commit
ad471a0
·
1 Parent(s): 441bbc4

cosine method

Browse files
back_end/data/sms_process_data_main.xlsx CHANGED
Binary files a/back_end/data/sms_process_data_main.xlsx and b/back_end/data/sms_process_data_main.xlsx differ
 
back_end/models/embedding_model.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from sentence_transformers import SentenceTransformer
2
 
3
  # Load the pre-trained embedding model
@@ -5,5 +6,5 @@ model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=Tr
5
 
6
  def generate_embedding(text: str):
7
  """Generate a 768-dimensional embedding for the input text."""
8
- embedding = model.encode(text).tolist() # Convert NumPy array to list
9
  return embedding
 
1
+ # back_end/models/embedding_model.py
2
  from sentence_transformers import SentenceTransformer
3
 
4
  # Load the pre-trained embedding model
 
6
 
7
def generate_embedding(text: str):
    """Encode *text* with the module-level model and return the vector as a list."""
    vector = model.encode(text)
    # .tolist() converts the encoder's array output into plain Python values.
    return vector.tolist()
back_end/models/logistic.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:380a06dab235d33a0ef39e31efc30e00acb036c3630e631069297d05f0844092
3
+ size 6874
back_end/routers/embedding.py CHANGED
@@ -1,14 +1,59 @@
1
  from fastapi import APIRouter, HTTPException
 
 
 
2
  from back_end.models.embedding_model import generate_embedding
3
  from back_end.schemas.request import TextRequest
 
 
4
 
5
  router = APIRouter()
6
 
7
- @router.get("/generate_embedding/")
8
- def get_embedding(text: str):
 
 
 
 
 
 
 
 
 
 
 
 
9
  """Returns a 768-dimensional embedding for the given text."""
10
- if not text:
11
  raise HTTPException(status_code=400, detail="Text cannot be empty")
12
 
13
- embedding = generate_embedding(text)
14
  return {"dimensions": len(embedding), "embedding": embedding}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import APIRouter, HTTPException
2
+ import os
3
+ import pickle
4
+
5
  from back_end.models.embedding_model import generate_embedding
6
  from back_end.schemas.request import TextRequest
7
+ from sklearn.linear_model import LogisticRegression
8
+ from scipy.spatial.distance import cosine
9
 
10
  router = APIRouter()
11
 
12
# Resolve the pickle relative to this file so that loading does not depend on
# the working directory the server was started from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # Get the directory of the current file
MODEL_PATH = os.path.join(BASE_DIR, "..", "models", "logistic.pkl")

# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load artifacts produced by this project's own training script.
try:
    with open(MODEL_PATH, "rb") as f:
        logistic_model = pickle.load(f)
except FileNotFoundError as exc:
    raise RuntimeError(f"Model file not found at {MODEL_PATH}") from exc
except (pickle.UnpicklingError, EOFError) as exc:
    # UnpicklingError: the file is not a valid pickle (for example a text
    # placeholder checked out instead of the binary artifact).
    # EOFError: the file is empty or truncated.
    raise RuntimeError(f"Error unpickling model file at {MODEL_PATH}") from exc
22
+
23
+
24
@router.post("/generate_embedding/")
def get_embedding(request: TextRequest):
    """Embed the request's ``text`` and return the vector with its length.

    Raises:
        HTTPException: 400 when ``text`` is empty.
    """
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    vector = generate_embedding(text)
    return {"dimensions": len(vector), "embedding": vector}
32
+
33
+
34
@router.post("/cosine_similarity/")
def get_cosine_similarity(request: TextRequest):
    """Return the cosine similarity between ``request.text`` and ``request.text2``.

    Returns:
        A dict ``{"cosine_similarity": <float>}``.

    Raises:
        HTTPException: 400 when either text is missing or empty.
    """
    # Check the values, not attribute presence: ``text2`` is a declared field
    # with a default of None, so hasattr() is always true and would let a
    # missing second text through to the embedding model.
    if not request.text or not request.text2:
        raise HTTPException(status_code=400, detail="Both text inputs must be provided")

    embedding1 = generate_embedding(request.text)
    embedding2 = generate_embedding(request.text2)

    # scipy's cosine() is a *distance*; similarity is its complement.
    similarity = float(1 - cosine(embedding1, embedding2))
    return {"cosine_similarity": similarity}
45
+
46
+
47
@router.post("/logistic_prediction/")
def get_logistic_prediction(request: TextRequest):
    """Classify ``request.text`` with the loaded logistic regression model.

    Returns:
        A dict ``{"prediction": <label>}`` where the label is a native Python
        value.

    Raises:
        HTTPException: 400 when ``text`` is empty; 500 when the model call fails.
    """
    if not request.text:
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    embedding = generate_embedding(request.text)
    try:
        # predict() expects a 2-D batch, so wrap the single vector in a list
        # and take the first (only) result.
        prediction = logistic_model.predict([embedding])[0]
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Model prediction failed: {str(e)}") from e

    # The result is a NumPy scalar; unwrap it so the JSON encoder receives a
    # plain int/float/str instead of e.g. numpy.int64.
    if hasattr(prediction, "item"):
        prediction = prediction.item()

    return {"prediction": prediction}
back_end/schemas/request.py CHANGED
@@ -2,3 +2,6 @@ from pydantic import BaseModel
2
 
3
  class TextRequest(BaseModel):
4
  text: str
 
 
 
 
2
 
3
  # Request body used by the embedding, cosine-similarity and
  # logistic-prediction endpoints.
  class TextRequest(BaseModel):
4
  # Primary input text (required).
  text: str
5
  # Second text, read only by the cosine-similarity endpoint; defaults to None.
  # NOTE(review): annotated `str` but defaulted to None - consider Optional[str];
  # confirm how the pydantic version in use treats an explicit null here.
+ text2: str = None # Optional for cosine similarity
6
+
7
+
back_end/service/train_model.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.linear_model import LogisticRegression
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import accuracy_score
8
+
9
# Train a logistic regression classifier on sentence embeddings of the SMS
# dataset and pickle it where the API router loads it from.

# Define paths relative to this file (back_end/service/) so the script works
# from any working directory and writes to the location the router reads.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PACKAGE_DIR = os.path.dirname(BASE_DIR)
DATA_PATH = os.path.join(PACKAGE_DIR, "data", "sms_process_data_main.xlsx")
MODEL_PATH = os.path.join(PACKAGE_DIR, "models", "logistic.pkl")

# Check if the dataset file exists
print(DATA_PATH)
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset file not found at: {DATA_PATH}")

# Load dataset
df = pd.read_excel(DATA_PATH)

# Ensure the dataset has the required columns (adjust as necessary)
if not {'text', 'label'}.issubset(df.columns):
    raise ValueError("Dataset must contain 'text' and 'label' columns")

# Load Sentence Transformer model
embedding_model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

# Generate embeddings in one batched call instead of one encode() per row;
# the result is a 2-D array with one embedding per input text.
X = embedding_model.encode(df['text'].tolist())
y = df['label']

# Train/test split (fixed seed so the reported accuracy is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Evaluate the model on the held-out 20%
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Save the model
print("Saving model and embeddings...")
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(logistic_model, f)

print(f"Logistic model saved to {MODEL_PATH}")