Spaces:

DvorakInnovationAI
/

GenAI-FASTAPI

Sleeping

File size: 4,069 Bytes

import re
import numpy as np
import faiss
from src.genai.utils.models_loader import embedding_model , encoding_model
from src.genai.utils.utils import clean_text
from src.genai.utils.data_loader import api_knowledge_df, api_index, caption_df , caption_index
from src.genai.utils.models_loader import embedding_model
import pandas as pd

class APIKnowledgeRetrieveTool:
    """Look up the API knowledge entry nearest to a query embedding.

    The tool wraps the module-level ``api_knowledge_df`` table and the
    ``api_index`` vector index imported from the data loader.
    """

    def __init__(self):
        self.df = api_knowledge_df
        self.index = api_index

    def retrieve(self, query):
        """Return the endpoint description of the single closest entry.

        Args:
            query: Text handed to ``embedding_model.embed_query``.

        Returns:
            A dict with the keys ``'endpoint'``, ``'method'`` and
            ``'parameters'`` taken from the matching row(s) of ``self.df``.

        NOTE(review): the lookup uses the whole first row of the index
        result (a length-1 array), so each dict value is a one-element
        pandas Series rather than a scalar -- confirm callers expect that.
        """
        raw_vector = embedding_model.embed_query(query)
        # The index is queried with a single float32 row vector.
        query_vector = np.array(raw_vector).reshape(1, -1).astype('float32')
        _, neighbour_ids = self.index.search(query_vector, 1)
        match = self.df.iloc[neighbour_ids[0]]
        return {key: match[key] for key in ('endpoint', 'method', 'parameters')}
    
class RetrieverBackup:
    """Retrieve influencer caption rows that are similar to a text query.

    The retriever wraps the module-level ``caption_df`` table and the
    ``caption_index`` vector index imported from the data loader. Positions
    returned by the index are resolved against the *full* table
    (presumably the index was built row-for-row from it -- verify against
    the data loader).
    """

    # Minimum score reported by the index for a hit to be kept.
    SIMILARITY_THRESHOLD = 0.35
    # Maximum number of entries returned by ``retrieve``.
    MAX_RESULTS = 10
    # Maximum number of tokens kept in the text built by ``retrieve_old``.
    MAX_TOKENS = 500

    def __init__(self):
        self.df = caption_df
        self.index = caption_index

    def _matched_usernames(self, query):
        """Return the known usernames that occur as whole words in *query*.

        The query is converted with ``str`` first so that the same objects
        accepted by the embedding step are accepted here too.
        """
        text = str(query)
        usernames = self.df["username"].dropna().unique()
        return [u for u in usernames if re.search(rf"\b{re.escape(u)}\b", text)]

    def _filter_dataset(self, query):
        """Return the rows of users named in *query*, or the full table.

        Args:
            query: Text (or any object with a meaningful ``str``) to scan
                for usernames.

        Returns:
            A filtered DataFrame when at least one username is mentioned,
            otherwise ``self.df`` itself.
        """
        matched_users = self._matched_usernames(query)
        if matched_users:
            return self.df[self.df["username"].isin(matched_users)]
        return self.df

    def _select_hits(self, indices, distances, matched_users, limit=None):
        """Turn one row of index results into ``(row, score)`` pairs.

        Args:
            indices: Row positions into ``self.df`` as returned by the index.
            distances: Scores aligned with *indices*.
            matched_users: Usernames to restrict the hits to; an empty
                collection disables the restriction.
            limit: Stop after this many hits (``None`` means no limit).

        Returns:
            A list of ``(row, score)`` tuples in the order given by the index.
        """
        allowed = set(matched_users)  # O(1) membership test per hit
        hits = []
        for idx, sim in zip(indices, distances):
            # FAISS-style indexes pad missing neighbours with -1; without
            # this guard ``iloc[-1]`` would silently return the last row.
            if idx < 0 or sim < self.SIMILARITY_THRESHOLD:
                continue
            row = self.df.iloc[idx]
            # If the query mentions usernames, only keep those rows.
            if allowed and row["username"] not in allowed:
                continue
            hits.append((row, sim))
            if limit is not None and len(hits) >= limit:
                break
        return hits

    def retrieve_old(self, query):
        """Return a token-limited text summary of the matching influencers.

        Args:
            query: Text to embed and to scan for usernames.

        Returns:
            ``"No influencers found."`` when nothing passes the similarity
            threshold (and the username restriction, if any); otherwise the
            cleaned summary text truncated to ``MAX_TOKENS`` tokens.
        """
        query_embedding = np.array(embedding_model.embed_query(str(query))).reshape(1, -1).astype('float32')
        print('Embeddings Generated')
        faiss.normalize_L2(query_embedding)
        print('Query embedded')

        # Search the full index and filter afterwards: the positions returned
        # by the index refer to the full table, not to a filtered copy.
        distances, indices = self.index.search(query_embedding, len(self.df))
        hits = self._select_hits(indices[0], distances[0], self._matched_usernames(query))
        if not hits:
            return "No influencers found."

        outer_list = [
            [
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}"
            ]
            for rank, (row, _sim) in enumerate(hits, 1)
        ]

        cleaned_response = clean_text(str(outer_list))
        print('response cleaned')
        tokens = encoding_model.encode(cleaned_response)[:self.MAX_TOKENS]
        print('tokens got')
        return encoding_model.decode(tokens)

    def retrieve(self, query):
        """Return up to ``MAX_RESULTS`` matching videos as plain dicts.

        Args:
            query: Text to embed and to scan for usernames.

        Returns:
            A list of dicts with the keys ``'url'``, ``'username'``,
            ``'likesCount'`` and ``'commentCount'``; the counts are ``int``
            or ``None`` when the stored value is null.
        """
        query_embedding = np.array(embedding_model.embed_query(str(query))).reshape(1, -1).astype('float32')
        faiss.normalize_L2(query_embedding)

        # Search on full dataset (index is built on full df)
        distances, indices = self.index.search(query_embedding, len(self.df))
        hits = self._select_hits(
            indices[0],
            distances[0],
            self._matched_usernames(query),
            limit=self.MAX_RESULTS,
        )

        return [
            {
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': int(row['likesCount']) if pd.notnull(row['likesCount']) else None,
                'commentCount': int(row['commentCount']) if pd.notnull(row['commentCount']) else None
            }
            for row, _sim in hits
        ]