| |
|
import base64
import os
import re
import traceback

import faiss
import numpy as np
import pandas as pd
import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
| |
|
| | |
app = FastAPI()
# Hugging Face Inference API token, read from the environment.
# May be None if the variable is unset — TODO confirm the deployment always sets it.
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
| |
|
| | |
# Hosted feature-extraction endpoint for the MiniLM sentence encoder
# (the endpoints below expect 384-dimensional vectors from it).
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
HEADERS = {
    "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
    "Content-Type": "application/json; charset=UTF-8",
}
| |
|
| | |
# Most recent embedding produced by the query endpoints; read back by
# GET /last-embedding.
global_embedding = None

# Load the prebuilt FAISS index at startup. Fall back to None on failure so
# the endpoints — which already check `index is None` — can return a clear
# error instead of the whole app crashing at import time.
try:
    # Fixed filename typo: was "news_index.faissFF".
    index = faiss.read_index("news_index.faiss")
except Exception as exc:
    print(f"Failed to load FAISS index: {exc}")
    index = None
| |
|
| |
|
@app.get('/')
def home():
    """Health-check root endpoint."""
    greeting = {"Message": "Hello"}
    return greeting
| |
|
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| |
|
| | |
| | |
| |
|
| | |
class EmbeddingRequest(BaseModel):
    """Request body for the embedding/search endpoints."""

    # Free-form text to embed and search with.
    text: str
| |
|
| | |
def get_embedding(text: str):
    """Fetch a sentence embedding for ``text`` from the HF Inference API.

    Returns the parsed JSON response (callers expect a list of 384 floats).
    Raises HTTPException on transport errors or non-200 upstream responses.
    """
    try:
        # Explicit timeout so a hung upstream call cannot block a worker forever.
        response = requests.post(
            API_URL, headers=HEADERS, json={"inputs": text}, timeout=30
        )

        if response.status_code != 200:
            # Upstream errors are not always JSON (e.g. HTML gateway pages);
            # fall back to the raw text so the client still sees the cause.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            raise HTTPException(status_code=response.status_code, detail=detail)

        return response.json()

    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=str(e))
| |
|
| |
|
| |
|
# Startup diagnostics: the endpoints treat `index is None` as "not loaded",
# so follow the same convention here instead of dereferencing blindly.
if index is not None:
    print(f"FAISS index size: {index.ntotal}")
else:
    print("FAISS index not loaded")

# Article metadata; the search code indexes it by FAISS result ids, so row i
# presumably corresponds to index vector i — verify against the index builder.
news_df = pd.read_csv("news_dataset.csv")
| |
|
| |
|
| |
|
| |
|
| |
|
@app.post("/get_Emd_Corrected")
async def generate_embedding(request: EmbeddingRequest):
    """Embed the raw query text and return the 10 nearest news articles.

    Response includes the embedding itself plus raw distances/indices for
    debugging. Errors are returned as a JSON body (not an HTTP error).
    NOTE(review): this handler shares its function name with the
    /get_Emd_Data handler below; both routes work (FastAPI registers them at
    decoration time) but the duplicate name is worth renaming.
    """
    global global_embedding
    try:
        embedding = np.array(get_embedding(request.text), dtype="float32")

        # The MiniLM model produces 384-dim vectors; anything else means the
        # API returned an error payload or a nested/batched shape.
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        embedding_query = embedding.reshape(1, -1)

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10
        distances, indices = index.search(embedding_query, k)

        # Store for GET /last-embedding (previously nothing ever wrote this,
        # so that endpoint always 404'd).
        global_embedding = embedding.tolist()

        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads missing neighbours with -1; guard both bounds so we
            # never serve news_df.iloc[-1] (the last row) by accident.
            if 0 <= idx < len(news_df):
                article = news_df.iloc[idx].to_dict()
                article["distance"] = float(distances[0][i])
                results.append(article)

        return {
            "embedding": embedding.tolist(),
            "Distances": distances.tolist(),
            "Indices": indices.tolist(),
            "results": results
        }

    except Exception as e:
        return {"error": str(e), "traceback": traceback.format_exc()}
| |
|
import re  # NOTE: duplicate imports are harmless; this belongs at the top of the file

# Compiled once at import time instead of on every call: ASCII control chars,
# DEL, pop-directional-formatting (U+202C), and the BOM (U+FEFF).
_INVALID_CHARS_RE = re.compile(r"[\x00-\x1F\x7F\u202c\ufeff]")


def clean_arabic_text(text):
    """Removes invalid characters that cause JSON decoding errors"""
    return _INVALID_CHARS_RE.sub("", text).strip()
| |
|
| |
|
@app.post("/get_Emd_Data")
async def generate_embedding(request: EmbeddingRequest):
    """Clean the query text, embed it, and return the 10 nearest articles.

    Unlike /get_Emd_Corrected this returns only the matched articles.
    NOTE(review): shares its function name with the handler above — both
    routes work, but the duplicate name is worth renaming.
    """
    try:
        request.text = clean_arabic_text(request.text)
        # NOTE(review): the *base64 string* is what gets embedded, not the
        # text itself — presumably an old workaround for JSON-encoding errors
        # that clean_arabic_text now handles. If the FAISS index was built
        # from raw text, these embeddings will not match it; verify against
        # the index-building pipeline before changing or keeping this.
        encoded_text = base64.b64encode(request.text.encode()).decode()

        embedding = np.array(get_embedding(encoded_text), dtype="float32")

        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        embedding_query = embedding.reshape(1, -1)

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10
        distances, indices = index.search(embedding_query, k)

        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads missing neighbours with -1; guard both bounds so we
            # never serve news_df.iloc[-1] (the last row) by accident.
            if 0 <= idx < len(news_df):
                article = news_df.iloc[idx].to_dict()
                article["distance"] = float(distances[0][i])
                results.append(article)

        return {"results": results}

    except Exception as e:
        return {"error": str(e), "traceback": traceback.format_exc()}
| |
|
| |
|
| |
|
| | |
@app.get("/last-embedding")
async def get_last_embedding():
    """Return the most recently stored embedding, or 404 if none exists."""
    if global_embedding is not None:
        return {"last_embedding": global_embedding}
    raise HTTPException(status_code=404, detail="No embedding stored yet")
| |
|
| |
|
| |
|