File size: 5,209 Bytes
06ee9db
 
 
f214940
06ee9db
f214940
06ee9db
 
 
f214940
 
06ee9db
f214940
06ee9db
f214940
06ee9db
 
 
0ec3c01
06ee9db
 
f214940
06ee9db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f214940
06ee9db
 
f214940
06ee9db
 
f214940
06ee9db
 
 
f214940
06ee9db
 
 
f214940
06ee9db
 
 
 
 
 
 
f214940
06ee9db
f214940
06ee9db
 
f214940
 
 
06ee9db
 
 
 
 
 
f214940
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

import base64
import traceback
import faiss
from fastapi import FastAPI, HTTPException
import requests
from pydantic import BaseModel
import numpy as np
import pandas as pd
import os

# Initialize FastAPI app
app = FastAPI()
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")  # Load from environment variable

# Hugging Face API details
# API_URL targets the all-MiniLM-L6-v2 sentence-transformer, which returns
# 384-dimensional embeddings (the endpoints below validate that size).
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
HEADERS = {
    "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
    "Content-Type": "application/json; charset=UTF-8",
    }

# Store embeddings globally (in-memory storage)
# NOTE(review): global_embedding is read by /last-embedding but never assigned
# anywhere in this file — that endpoint will always 404; confirm intent.
global_embedding = None  
# NOTE(review): the ".faissFF" extension looks like a typo (expected ".faiss"
# or ".index") — confirm the actual index filename on disk.
index = faiss.read_index("news_index.faissFF")


@app.get('/')
def home():
    """Root endpoint: responds with a static greeting payload."""
    greeting = {"Message": "Hello"}
    return greeting

# ReCreate the index file with 384 embedding -------------------------------------------------

# Define the correct dimension
# embedding_dim = 384

# # Create a new FAISS index with L2 distance
# new_index = faiss.IndexFlatL2(embedding_dim)

# # Extract only the first 384 dimensions from the old 4096D vectors
# stored_vectors = index.reconstruct_n(0, index.ntotal)  # Get all stored vectors
# stored_vectors_384 = stored_vectors[:, :embedding_dim]  # Keep only first 384D

# # Add them to the new FAISS index
# new_index.add(stored_vectors_384)


# faiss.write_index(new_index, "faiss_index_384D.index")
# -----------------------------------------------

# Request model for input validation
class EmbeddingRequest(BaseModel):
    """Request body shared by the embedding endpoints."""

    # Raw text to embed; some endpoints clean/encode it before use.
    text: str

# Function to get embedding from Hugging Face API
def get_embedding(text: str):
    """Fetch a sentence embedding for *text* from the Hugging Face API.

    Returns the parsed JSON response (expected: a list of floats from the
    feature-extraction pipeline).

    Raises:
        HTTPException: mirroring the upstream status code on a non-200 reply,
            or 500 if the HTTP request itself fails (network error, timeout).
    """
    try:
        # Explicit timeout: without one, requests.post can block forever on a
        # stalled connection and hang the serving worker.
        response = requests.post(API_URL, headers=HEADERS, json={"inputs": text}, timeout=30)

        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail=response.json())

        return response.json()

    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e))



# Sanity log: number of vectors currently stored in the loaded FAISS index.
print(f"FAISS index size: {index.ntotal}")  # Total stored vectors

# Articles dataframe; FAISS search indices are used as positional row indices
# into this frame, so row order must match the order vectors were indexed in.
news_df = pd.read_csv("news_dataset.csv")  # Ensure this file is in the correct directory





@app.post("/get_Emd_Corrected")
async def generate_embedding(request: EmbeddingRequest):
    """Embed the request text and return the 10 nearest news articles.

    Returns the raw embedding plus FAISS distances/indices and the matching
    rows from ``news_df``. On any failure, returns a JSON error payload
    instead of raising.
    """
    try:
        embedding = np.array(get_embedding(request.text), dtype="float32")

        # The MiniLM model behind API_URL produces 384-dim vectors; anything
        # else means the upstream response was not a plain embedding.
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        embedding_query = embedding.reshape(1, -1)  # FAISS expects (n_queries, dim)

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10  # number of nearest neighbours to retrieve
        distances, indices = index.search(embedding_query, k)

        # Retrieve news articles based on indices
        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads results with -1 when fewer than k neighbours exist;
            # the previous `idx < len(news_df)` check let -1 through and
            # silently returned the *last* row via iloc[-1]. Require a valid,
            # non-negative in-range index.
            if 0 <= idx < len(news_df):
                article = news_df.iloc[int(idx)].to_dict()
                article["distance"] = float(distances[0][i])  # Add similarity score
                results.append(article)

        return {
            "embedding": embedding.tolist(),
            "Distances": distances.tolist(),
            "Indices": indices.tolist(),
            "results": results
        }

    except Exception as e:
        # NOTE(review): returning the traceback to the client leaks internals;
        # consider logging it server-side and returning a generic message.
        return {"error": str(e), "traceback": traceback.format_exc()}

import re

def clean_arabic_text(text):
    """Strip hidden control/formatting characters that break JSON decoding,
    then trim surrounding whitespace."""
    invisible = re.compile(r"[\x00-\x1F\x7F\u202c\ufeff]")
    return invisible.sub("", text).strip()


@app.post("/get_Emd_Data")
async def generate_embedding(request: EmbeddingRequest):
    """Clean the request text, embed it, and return the 10 nearest articles.

    Unlike /get_Emd_Corrected, this endpoint returns only the matched rows
    (no raw embedding/distances) and Base64-encodes the cleaned text before
    embedding. On any failure, returns a JSON error payload instead of raising.
    """
    try:
        request.text = clean_arabic_text(request.text)
        # NOTE(review): the *Base64 string* is what gets embedded below, not
        # the original text — this changes the semantics of the query vector.
        # Confirm this encoding step is intentional and not a transport-layer
        # workaround that should happen after embedding.
        encoded_text = base64.b64encode(request.text.encode()).decode()  # Encode text in Base64

        # Get the embedding
        embedding = np.array(get_embedding(encoded_text), dtype="float32")

        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        # FAISS expects a (n_queries, dim) matrix
        embedding_query = embedding.reshape(1, -1)

        # Check if FAISS index is loaded
        if index is None:
            return {"error": "FAISS index not loaded"}

        # Search FAISS index
        k = 10  # Number of nearest neighbors
        distances, indices = index.search(embedding_query, k)

        # Retrieve news articles based on indices
        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads results with -1 when fewer than k neighbours exist;
            # the previous `idx < len(news_df)` check let -1 through and
            # silently returned the *last* row via iloc[-1].
            if 0 <= idx < len(news_df):
                article = news_df.iloc[int(idx)].to_dict()
                article["distance"] = float(distances[0][i])  # Add similarity score
                results.append(article)

        return {"results": results}

    except Exception as e:
        # NOTE(review): exposing tracebacks to clients leaks internals;
        # log server-side instead.
        return {"error": str(e), "traceback": traceback.format_exc()}



# Endpoint exposing the most recently stored in-memory embedding.
@app.get("/last-embedding")
async def get_last_embedding():
    """Return the last stored embedding, or 404 if none has been stored."""
    if global_embedding is not None:
        return {"last_embedding": global_embedding}
    raise HTTPException(status_code=404, detail="No embedding stored yet")