Noor22Tak commited on
Commit
06ee9db
·
verified ·
1 Parent(s): 2c5af17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -43
app.py CHANGED
@@ -1,62 +1,166 @@
1
- from fastapi import FastAPI
2
- from pydantic import BaseModel
3
- import pandas as pd
4
- import numpy as np
5
  import faiss
 
6
  import requests
 
 
 
7
  import os
8
 
 
9
  app = FastAPI()
 
10
 
11
- # Load dataset
12
- df = pd.read_csv("news_dataset.csv")
 
 
 
 
13
 
14
- HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY") # Load from environment variable
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Load FAISS index
17
- index = faiss.read_index("arabic_news_index")
18
 
19
- # Define request model
20
- class NewsQuery(BaseModel):
21
- prompt: str
22
 
 
 
 
23
 
 
 
 
24
 
25
- @app.get('/test')
26
- def test():
27
- return {"Message" : "This is work"}
 
 
 
 
28
 
 
29
 
30
- def create_textual_representation(row):
31
- """Convert a news article into a structured text representation."""
32
- return f"""
33
- الكاتب: {row['writer']},
34
- الموقع: {row['location']},
35
- التاريخ: {row['date']},
36
- الوقت: {row['time']},
37
- العنوان: {row['title']},
38
- الخبر: {row['news']}
39
- """
40
 
41
- @app.post("/recommend")
42
- async def recommend_articles(query: NewsQuery):
43
- """Find similar news articles using FAISS with real Llama 3.1 embeddings."""
44
-
45
- # Call Llama 3.1 remotely for embeddings
46
- res = requests.post("https://api-inference.huggingface.co/models/meta-llama/Llama-3.1-8B",
47
- headers={"Authorization": HUGGINGFACE_API_KEY},
48
- json={"inputs": query.prompt})
49
-
50
- if res.status_code != 200:
51
- return {"error": "Failed to get embeddings from Llama 3.1"}
52
 
53
- # Extract the real embedding
54
- embedding = np.array([res.json()[0]['embedding']], dtype="float32")
55
 
56
- # Search FAISS index for similar articles
57
- D, I = index.search(embedding, 5)
 
 
 
 
58
 
59
- # Retrieve recommended articles
60
- recommendations = df.iloc[I.flatten()][['title', 'writer', 'news']].to_dict(orient="records")
61
 
62
- return {"recommendations": recommendations}
 
1
+
2
+ import base64
3
+ import traceback
 
4
  import faiss
5
+ from fastapi import FastAPI, HTTPException
6
  import requests
7
+ from pydantic import BaseModel
8
+ import numpy as np
9
+ import pandas as pd
10
  import os
11
 
12
# Initialize FastAPI app
app = FastAPI()

HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")  # Load from environment variable

# Hugging Face Inference API details (sentence-transformers feature-extraction
# pipeline; this model produces 384-dimensional sentence embeddings).
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
HEADERS = {
    # BUG FIX: the header previously contained the literal string
    # "Bearer HUGGINGFACE_API_KEY", so the token loaded above was never
    # actually sent — interpolate the real value.
    "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
    "Content-Type": "application/json; charset=UTF-8",
}
22
 
23
# Store embeddings globally (in-memory storage)
# NOTE(review): `global_embedding` is never reassigned anywhere in this file,
# so the /last-embedding endpoint below will always report 404 — confirm
# whether an endpoint was supposed to store into it.
global_embedding = None
# NOTE(review): the ".faissFF" extension looks like a typo (cf. the commented
# "news_index" names elsewhere in this file) — verify the file name on disk.
index = faiss.read_index("news_index.faissFF")
26
+
27
+
28
@app.get('/')
def home():
    """Root endpoint: a trivial liveness check for the service."""
    payload = {"Message": "Hello"}
    return payload
31
+
32
+ # ReCreate the index file with 384 embedding -------------------------------------------------
33
+
34
+ # Define the correct dimension
35
+ # embedding_dim = 384
36
+
37
+ # # Create a new FAISS index with L2 distance
38
+ # new_index = faiss.IndexFlatL2(embedding_dim)
39
+
40
+ # # Extract only the first 384 dimensions from the old 4096D vectors
41
+ # stored_vectors = index.reconstruct_n(0, index.ntotal) # Get all stored vectors
42
+ # stored_vectors_384 = stored_vectors[:, :embedding_dim] # Keep only first 384D
43
+
44
+ # # Add them to the new FAISS index
45
+ # new_index.add(stored_vectors_384)
46
+
47
+
48
+ # faiss.write_index(new_index, "faiss_index_384D.index")
49
+ # -----------------------------------------------
50
+
51
# Request model for input validation
class EmbeddingRequest(BaseModel):
    """Request body for the embedding endpoints: the raw text to embed."""
    # text: the query text (Arabic news prompt) to be embedded
    text: str
54
+
55
# Function to get embedding from Hugging Face API
def get_embedding(text: str):
    """Fetch an embedding for *text* from the Hugging Face Inference API.

    Returns the parsed JSON response body (expected to be a flat list of
    floats for a single input). Raises HTTPException on transport failures
    or non-200 responses.
    """
    try:
        response = requests.post(API_URL, headers=HEADERS, json={"inputs": text})
    except requests.RequestException as e:
        # Transport-level failure (DNS, timeout, connection reset, ...).
        raise HTTPException(status_code=500, detail=str(e))

    if response.status_code != 200:
        # BUG FIX: error bodies are not always JSON; the old code called
        # response.json() unconditionally, and a JSONDecodeError is NOT a
        # requests.RequestException, so it escaped the handler as a bare 500.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        raise HTTPException(status_code=response.status_code, detail=detail)

    return response.json()
67
+
68
+
69
+
70
# Sanity log at startup: how many vectors the loaded FAISS index holds.
print(f"FAISS index size: {index.ntotal}")  # Total stored vectors

# Articles table; FAISS result ids are used as positional row indices into
# this DataFrame, so its row order must match the order vectors were added
# to the index — TODO confirm.
news_df = pd.read_csv("news_dataset.csv")  # Ensure this file is in the correct directory
73
+
74
+
75
+
76
+
77
+
78
@app.post("/get_Emd_Corrected")
async def generate_embedding(request: EmbeddingRequest):
    """Embed the request text and return the 10 nearest news articles.

    NOTE(review): the /get_Emd_Data handler below reuses this same function
    name and shadows it at module level. Both routes still work because
    FastAPI registers them at decoration time, but the functions should be
    given distinct names.
    """
    try:
        # get_embedding() is expected to return a flat list of floats.
        embedding = np.array(get_embedding(request.text), dtype="float32")

        # The MiniLM pipeline yields 384-dimensional sentence vectors.
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        embedding_query = embedding.reshape(1, -1)  # FAISS expects (n_queries, dim)

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10  # number of nearest neighbours to retrieve
        distances, indices = index.search(embedding_query, k)

        # BUG FIX: FAISS pads missing results with -1 when the index holds
        # fewer than k vectors. The old `idx < len(news_df)` check let -1
        # through, and `iloc[-1]` silently returned the LAST article —
        # require idx >= 0 as well.
        results = []
        for rank, idx in enumerate(indices[0]):
            if 0 <= idx < len(news_df):
                article = news_df.iloc[idx].to_dict()
                article["distance"] = float(distances[0][rank])  # FAISS distance score
                results.append(article)

        return {
            "embedding": embedding.tolist(),
            "Distances": distances.tolist(),
            "Indices": indices.tolist(),
            "results": results
        }

    except Exception as e:
        # SECURITY FIX: log the traceback server-side instead of returning it
        # to the client (information disclosure).
        traceback.print_exc()
        return {"error": str(e)}
112
+
113
import re  # NOTE: mid-file import kept in place; conventionally belongs at the top of the file

# Precompiled once; matches ASCII control characters, DEL, the pop-directional
# formatting mark, and the BOM.
_HIDDEN_CHARS = re.compile(r"[\x00-\x1F\x7F\u202c\ufeff]")

def clean_arabic_text(text):
    """Strip hidden control characters that break JSON decoding, then trim whitespace."""
    return _HIDDEN_CHARS.sub("", text).strip()
119
+
120
+
121
@app.post("/get_Emd_Data")
async def generate_embedding(request: EmbeddingRequest):
    """Clean + embed the request text and return the 10 nearest articles.

    NOTE(review): this redefines `generate_embedding` from the
    /get_Emd_Corrected endpoint above, shadowing it at module level. Both
    routes still work, but the functions deserve distinct names.
    """
    try:
        request.text = clean_arabic_text(request.text)
        # NOTE(review): the text is base64-encoded before being sent, so the
        # model embeds base64 gibberish rather than the Arabic text itself —
        # presumably a workaround for JSON encoding errors; confirm this is
        # intentional, since it likely degrades retrieval quality badly.
        encoded_text = base64.b64encode(request.text.encode()).decode()

        # Get the embedding (expected: flat list of floats).
        embedding = np.array(get_embedding(encoded_text), dtype="float32")

        # The MiniLM pipeline yields 384-dimensional sentence vectors.
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        embedding_query = embedding.reshape(1, -1)  # FAISS expects (n_queries, dim)

        # Check if FAISS index is loaded
        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10  # Number of nearest neighbors
        distances, indices = index.search(embedding_query, k)

        # BUG FIX: FAISS pads missing results with -1; the old
        # `idx < len(news_df)` check let -1 through and `iloc[-1]` silently
        # returned the LAST article — require idx >= 0 as well.
        results = []
        for rank, idx in enumerate(indices[0]):
            if 0 <= idx < len(news_df):
                article = news_df.iloc[idx].to_dict()
                article["distance"] = float(distances[0][rank])  # FAISS distance score
                results.append(article)

        return {"results": results}

    except Exception as e:
        # SECURITY FIX: log the traceback server-side rather than leaking it
        # to the client.
        traceback.print_exc()
        return {"error": str(e)}
 
 
 
 
 
 
 
 
156
 
 
 
 
 
 
 
 
 
 
 
 
157
 
 
 
158
 
159
# FastAPI endpoint to retrieve the last stored embedding
@app.get("/last-embedding")
async def get_last_embedding():
    """Return the most recently stored embedding, or 404 if none exists.

    NOTE(review): `global_embedding` is initialized to None at module level
    and no code in this file assigns it, so as written this always 404s —
    confirm whether a writer endpoint is missing.
    """
    if global_embedding is not None:
        return {"last_embedding": global_embedding}
    raise HTTPException(status_code=404, detail="No embedding stored yet")
165
 
 
 
166