"""Retrieval tools: API-endpoint lookup and influencer-caption similarity search."""

import re

import faiss
import numpy as np
import pandas as pd

from src.genai.utils.data_loader import (
    api_index,
    api_knowledge_df,
    caption_df,
    caption_index,
)
from src.genai.utils.models_loader import embedding_model, encoding_model
from src.genai.utils.utils import clean_text


class APIKnowledgeRetrieveTool:
    """Nearest-neighbour lookup of a single API endpoint for a natural-language query."""

    def __init__(self):
        # Endpoint metadata, row-aligned with the FAISS index built over its embeddings.
        self.df = api_knowledge_df
        self.index = api_index

    def retrieve(self, query):
        """Return the closest endpoint record for *query*.

        Returns:
            dict with 'endpoint', 'method' and 'parameters' scalar values.
        """
        query_embedding = (
            np.array(embedding_model.embed_query(query))
            .reshape(1, -1)
            .astype('float32')
        )
        distances, indices = self.index.search(query_embedding, 1)
        # BUGFIX: indices[0] is an array, so iloc[indices[0]] returned a
        # one-row DataFrame and row['endpoint'] a Series. Use the scalar position.
        row = self.df.iloc[indices[0][0]]
        return {
            'endpoint': row['endpoint'],
            'method': row['method'],
            'parameters': row['parameters'],
        }


class RetrieverBackup:
    """Similarity search over influencer captions with optional username filtering."""

    # Minimum similarity to keep a hit. Vectors are L2-normalised before search,
    # so inner-product scores behave as cosine similarity.
    SIMILARITY_THRESHOLD = 0.35

    def __init__(self):
        # Caption rows, row-aligned with the FAISS index built on the FULL frame.
        self.df = caption_df
        self.index = caption_index

    def _matched_usernames(self, query):
        """Dataset usernames that appear as whole words in *query*."""
        usernames = self.df["username"].dropna().unique()
        return [u for u in usernames if re.search(rf"\b{re.escape(u)}\b", str(query))]

    def _filter_dataset(self, query):
        """Rows whose username is mentioned in *query*; the whole frame if none match."""
        matched_users = self._matched_usernames(query)
        if matched_users:
            return self.df[self.df["username"].isin(matched_users)]
        return self.df

    def retrieve_old(self, query):
        """Legacy retrieval: formatted text summary, truncated to 500 tokens."""
        query_embedding = (
            np.array(embedding_model.embed_query(str(query)))
            .reshape(1, -1)
            .astype('float32')
        )
        print('Embeddings Generated')
        faiss.normalize_L2(query_embedding)
        print('Query embedded')
        filtered_df = self._filter_dataset(query)
        # Search the FULL index; k bounded by how many rows can possibly qualify.
        distances, indices = self.index.search(query_embedding, len(self.df))
        selected = [
            (idx, sim)
            for idx, sim in zip(indices[0], distances[0])
            if sim >= self.SIMILARITY_THRESHOLD
        ]
        if not selected:
            return "No influencers found."
        # BUGFIX: FAISS positions refer to the full dataframe the index was
        # built on; iloc-ing into the *filtered* frame picked wrong rows (or
        # raised IndexError). Resolve against self.df and keep only rows that
        # survive the username filter.
        allowed_labels = set(filtered_df.index)
        outer_list = []
        rank = 0
        for idx, sim in selected:
            row = self.df.iloc[idx]
            if row.name not in allowed_labels:
                continue
            rank += 1
            inner_list = [
                f"[{rank}]. \nThe influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ]
            outer_list.append(inner_list)
        if not outer_list:
            # Everything above threshold was filtered out by username.
            return "No influencers found."
        cleaned_response = clean_text(str(outer_list))
        print('response cleaned')
        tokens = encoding_model.encode(cleaned_response)[:500]
        print('tokens got')
        return encoding_model.decode(tokens)

    def retrieve(self, query):
        """Return up to 10 matching influencer records as plain dicts."""
        query_embedding = (
            np.array(embedding_model.embed_query(str(query)))
            .reshape(1, -1)
            .astype('float32')
        )
        faiss.normalize_L2(query_embedding)
        # Search on the full dataset (the index is built on the full df).
        distances, indices = self.index.search(query_embedding, len(self.df))
        matched_users = self._matched_usernames(query)
        results = []
        for idx, sim in zip(indices[0], distances[0]):
            if sim < self.SIMILARITY_THRESHOLD:
                continue
            row = self.df.iloc[idx]
            # If the query names specific influencers, keep only those rows.
            if matched_users and row["username"] not in matched_users:
                continue
            results.append({
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': int(row['likesCount']) if pd.notnull(row['likesCount']) else None,
                'commentCount': int(row['commentCount']) if pd.notnull(row['commentCount']) else None,
            })
            if len(results) == 10:  # cap at 10; stop early instead of slicing after
                break
        return results