Spaces:

Noor22Tak
/

First_rec

Sleeping

App Files Files Community

Noor22Tak committed on Mar 30, 2025

Commit

06ee9db

verified ·

1 Parent(s): 2c5af17

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -43

app.py CHANGED Viewed

@@ -1,62 +1,166 @@
-from fastapi import FastAPI
-from pydantic import BaseModel
-import pandas as pd
-import numpy as np
 import faiss
 import requests
 import os
 app = FastAPI()
-# Load dataset
-df = pd.read_csv("news_dataset.csv")
-HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")  # Load from environment variable
-# Load FAISS index
-index = faiss.read_index("arabic_news_index")
class NewsQuery(BaseModel):
    """Request body accepted by the recommendation endpoint."""

    # Free-text query supplied by the client.
    prompt: str
@app.get('/test')
def test():
    """Return a fixed payload so a caller can confirm the service responds."""
    payload = {"Message" : "This is work"}
    return payload
def create_textual_representation(row):
    """Render one article record as a labelled, multi-line text block.

    ``row`` is indexed with the keys ``writer``, ``location``, ``date``,
    ``time``, ``title`` and ``news``; each value is placed after its Arabic
    label, one field per line. The surrounding newline and indentation of
    the template are part of the returned string.
    """
    representation = f"""
    الكاتب: {row['writer']},
    الموقع: {row['location']},
    التاريخ: {row['date']},
    الوقت: {row['time']},
    العنوان: {row['title']},
    الخبر: {row['news']}
    """
    return representation
@app.post("/recommend")
async def recommend_articles(query: NewsQuery):
    """Find news articles similar to the prompt via a remote embedding and FAISS.

    The prompt is posted to the Hugging Face Inference API, the returned
    embedding is searched against ``index`` for the 5 nearest neighbours, and
    the matching rows of ``df`` are returned.

    Returns:
        ``{"recommendations": [...]}`` with the ``title``, ``writer`` and
        ``news`` fields of each hit, or ``{"error": ...}`` when the embedding
        request fails.
    """
    headers = {}
    if HUGGINGFACE_API_KEY:
        # The Inference API expects "Bearer <token>". Accept an environment
        # value that already carries the scheme as well as a bare token.
        token = HUGGINGFACE_API_KEY
        headers["Authorization"] = (
            token if token.startswith("Bearer ") else f"Bearer {token}"
        )

    try:
        res = requests.post(
            "https://api-inference.huggingface.co/models/meta-llama/Llama-3.1-8B",
            headers=headers,
            json={"inputs": query.prompt},
            timeout=30,  # without a timeout a stalled upstream hangs the request forever
        )
    except requests.RequestException:
        return {"error": "Failed to get embeddings from Llama 3.1"}
    if res.status_code != 200:
        return {"error": "Failed to get embeddings from Llama 3.1"}

    # Extract the embedding as a 1 x dim float32 matrix, the layout FAISS searches.
    embedding = np.array([res.json()[0]['embedding']], dtype="float32")

    _, labels = index.search(embedding, 5)

    # FAISS pads missing neighbours with -1; iloc would read that as "last row",
    # so keep only ids that are real row positions.
    row_count = len(df)
    hits = [int(label) for label in labels.flatten() if 0 <= label < row_count]

    recommendations = df.iloc[hits][['title', 'writer', 'news']].to_dict(orient="records")
    return {"recommendations": recommendations}

+import base64
+import traceback
 import faiss
+from fastapi import FastAPI, HTTPException
 import requests
+from pydantic import BaseModel
+import numpy as np
+import pandas as pd
 import os
# Application object; the route decorators below register their handlers on it.
app = FastAPI()
# Hugging Face access token, read from the environment (None when unset).
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

# Hugging Face Inference API endpoint: feature extraction with all-MiniLM-L6-v2.
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# Headers sent with every embedding request.
HEADERS = {
    "Content-Type": "application/json; charset=UTF-8",
}
if HUGGINGFACE_API_KEY:
    # Interpolate the token itself; the previous value was the literal text
    # "Bearer HUGGINGFACE_API_KEY". The header is omitted when no token is
    # configured, rather than sending a bogus credential.
    HEADERS["Authorization"] = f"Bearer {HUGGINGFACE_API_KEY}"
# Most recent embedding, kept in process memory only; None until one is stored.
global_embedding = None

# FAISS index, read from disk once at import time.
index = faiss.read_index("news_index.faissFF")
@app.get('/')
def home():
    """Root route: reply with a fixed greeting payload."""
    greeting = {"Message": "Hello"}
    return greeting
# One-off migration, kept disabled for reference: rebuild the index file with 384-D vectors ----
# Target dimension (the endpoints below expect 384-D embeddings)
# embedding_dim = 384
# # Create a new FAISS index with L2 distance
# new_index = faiss.IndexFlatL2(embedding_dim)
# # Extract only the first 384 dimensions from the old 4096D vectors
# # NOTE(review): truncating 4096-D vectors does not put them in the same embedding space as
# # the 384-D query model, so distances against such an index are presumably not meaningful;
# # confirm how the deployed index was built, and re-embed the articles if it was built this way.
# stored_vectors = index.reconstruct_n(0, index.ntotal)  # Get all stored vectors
# stored_vectors_384 = stored_vectors[:, :embedding_dim]  # Keep only first 384D
# # Add them to the new FAISS index
# new_index.add(stored_vectors_384)
# faiss.write_index(new_index, "faiss_index_384D.index")
# -----------------------------------------------
class EmbeddingRequest(BaseModel):
    """Request body for the embedding endpoints."""

    # Text to be embedded.
    text: str
def get_embedding(text: str, timeout: float = 30.0):
    """Fetch the embedding of ``text`` from the Hugging Face Inference API.

    Args:
        text: The string posted as the ``inputs`` field.
        timeout: Seconds to wait for the upstream service before giving up.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        HTTPException: with the upstream status code when the API answers
            with anything other than 200, or with status 500 when the request
            itself fails or the success body is not valid JSON.
    """
    try:
        # Without a timeout a stalled upstream would block this call forever.
        response = requests.post(
            API_URL, headers=HEADERS, json={"inputs": text}, timeout=timeout
        )
    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a gateway HTML page);
        # fall back to the raw text so the upstream status is still reported.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        raise HTTPException(status_code=response.status_code, detail=detail)

    try:
        return response.json()
    except ValueError as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# Startup diagnostic: report how many vectors the loaded index holds.
print(f"FAISS index size: {index.ntotal}")
# Article table; the endpoints use FAISS result ids as row positions into it,
# so news_dataset.csv must be present in the working directory.
news_df = pd.read_csv("news_dataset.csv")
@app.post("/get_Emd_Corrected")
async def generate_embedding(request: EmbeddingRequest):
    """Embed the request text and return its nearest news articles.

    The text is passed unchanged to ``get_embedding``. The resulting vector
    must have 384 entries; it is searched against the FAISS ``index`` for the
    10 nearest neighbours, and the matching rows of ``news_df`` are returned
    with their ``distance`` attached.

    Returns:
        On success, a dict with ``embedding``, ``Distances``, ``Indices`` and
        ``results``. On a local failure, a dict with ``error`` (plus
        ``traceback`` for unexpected exceptions).

    Raises:
        HTTPException: propagated unchanged from ``get_embedding`` so the
            client sees the upstream status instead of a 200 error payload.
    """
    global global_embedding
    try:
        # NOTE: get_embedding performs a blocking HTTP call.
        embedding = np.array(get_embedding(request.text), dtype="float32")
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        # Remember the latest vector so /last-embedding has something to serve.
        global_embedding = embedding.tolist()

        embedding_query = embedding.reshape(1, -1)  # 1 x 384 query matrix

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10
        distances, indices = index.search(embedding_query, k)

        results = []
        row_count = len(news_df)
        for distance, idx in zip(distances[0], indices[0]):
            idx = int(idx)
            # FAISS fills unused result slots with -1; without the lower bound
            # iloc[-1] would silently return the last article.
            if not 0 <= idx < row_count:
                continue
            article = news_df.iloc[idx].to_dict()
            article["distance"] = float(distance)
            results.append(article)

        return {
            "embedding": embedding.tolist(),
            "Distances": distances.tolist(),
            "Indices": indices.tolist(),
            "results": results
        }
    except HTTPException:
        # Let FastAPI turn upstream failures into proper HTTP error responses.
        raise
    except Exception as e:
        # NOTE(review): the traceback is returned to the client; consider
        # logging it server-side instead once clients no longer rely on it.
        return {"error": str(e), "traceback": traceback.format_exc()}
+import re
# Whitespace control characters: replaced by one space so neighbouring words
# stay separate instead of being glued together.
_WHITESPACE_CONTROLS = re.compile(r"[\t\n\r\x0b\x0c]+")
# Remaining C0 controls, DEL, invisible bidi marks/overrides/isolates and the
# BOM: removed outright.
_INVISIBLE_CHARS = re.compile(
    r"[\x00-\x1F\x7F\u200e\u200f\u202a-\u202e\u2066-\u2069\ufeff]"
)


def clean_arabic_text(text):
    """Strip control and invisible formatting characters from ``text``.

    Tabs, newlines and other whitespace controls become a single space; the
    other control characters, bidi formatting marks and the BOM are deleted.
    Leading and trailing whitespace is trimmed. ``None`` or an empty string
    yields ``""``.
    """
    if not text:
        return ""
    text = _WHITESPACE_CONTROLS.sub(" ", text)
    text = _INVISIBLE_CHARS.sub("", text)
    return text.strip()
@app.post("/get_Emd_Data")
async def generate_embedding(request: EmbeddingRequest):
    """Clean the request text, embed it and return its nearest news articles.

    The text is passed through ``clean_arabic_text`` and then sent to
    ``get_embedding`` as plain text. The resulting vector must have 384
    entries; it is searched against the FAISS ``index`` for the 10 nearest
    neighbours, and the matching rows of ``news_df`` are returned with their
    ``distance`` attached.

    Returns:
        ``{"results": [...]}`` on success, or a dict with ``error`` (plus
        ``traceback`` for unexpected exceptions) on a local failure.

    Raises:
        HTTPException: propagated unchanged from ``get_embedding`` so the
            client sees the upstream status instead of a 200 error payload.
    """
    global global_embedding
    try:
        # Embed the cleaned text itself. Base64-encoding it first made the
        # model embed the Base64 string rather than the article text; the
        # JSON encoder already escapes non-ASCII characters safely.
        cleaned_text = clean_arabic_text(request.text)

        # NOTE: get_embedding performs a blocking HTTP call.
        embedding = np.array(get_embedding(cleaned_text), dtype="float32")
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        # Remember the latest vector so /last-embedding has something to serve.
        global_embedding = embedding.tolist()

        embedding_query = embedding.reshape(1, -1)  # 1 x 384 query matrix

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10  # Number of nearest neighbors
        distances, indices = index.search(embedding_query, k)

        results = []
        row_count = len(news_df)
        for distance, idx in zip(distances[0], indices[0]):
            idx = int(idx)
            # FAISS fills unused result slots with -1; without the lower bound
            # iloc[-1] would silently return the last article.
            if not 0 <= idx < row_count:
                continue
            article = news_df.iloc[idx].to_dict()
            article["distance"] = float(distance)
            results.append(article)

        return {"results": results}
    except HTTPException:
        # Let FastAPI turn upstream failures into proper HTTP error responses.
        raise
    except Exception as e:
        # NOTE(review): the traceback is returned to the client; consider
        # logging it server-side instead once clients no longer rely on it.
        return {"error": str(e), "traceback": traceback.format_exc()}
@app.get("/last-embedding")
async def get_last_embedding():
    """Return the module-level ``global_embedding``.

    Raises:
        HTTPException: 404 while ``global_embedding`` is still ``None``.
    """
    if global_embedding is not None:
        return {"last_embedding": global_embedding}
    raise HTTPException(status_code=404, detail="No embedding stored yet")