from langchain_core.tools import tool
from .state import QueryFormatter
import pandas as pd
import numpy as np
import ast
import faiss
import tiktoken
from src.genai.utils.models_loader import embedding_model
from src.genai.utils.data_loader import caption_index, caption_df, ideas_index, ideas_df
from src.genai.utils.utils import clean_text


class Retrieval:
    """Look up influencer captions and ideas similar to a business description.

    The business details are embedded once at construction time; both lookup
    methods reuse that single L2-normalised query vector against the
    module-level FAISS indexes (``caption_index`` and ``ideas_index``).
    """

    def __init__(self, business_details):
        """Embed ``business_details`` and store the normalised query vector.

        Args:
            business_details: Any object describing the business; it is
                converted with ``str()`` before being embedded.
        """
        self.business_details = business_details
        embedding = embedding_model.embed_query(str(business_details))
        # FAISS expects a 2-D float32 matrix with one row per query.
        self.query_embedding = np.array(embedding).reshape(1, -1).astype('float32')
        # Normalises the row in place.
        faiss.normalize_L2(self.query_embedding)

    def influencers_data(self, similarity_threshold=0.35, max_tokens=100):
        """Return a token-limited text summary of matching influencer posts.

        Every row of ``caption_df`` is a candidate; a candidate is kept when
        its score from ``caption_index`` is at least ``similarity_threshold``.
        NOTE(review): the ``>=`` comparison treats scores as similarities
        (higher is closer), which presumably means ``caption_index`` is an
        inner-product index -- confirm against the data loader.

        Args:
            similarity_threshold: Minimum score for a hit to be reported.
            max_tokens: Maximum number of ``gpt-4o-mini`` tokens kept from
                the cleaned summary.

        Returns:
            The cleaned summary truncated to ``max_tokens`` tokens, or
            ``"No influencers found."`` when nothing qualifies.
        """
        top_k = len(caption_df)
        if top_k == 0:
            # Nothing to search; the FAISS wrapper also rejects k == 0.
            return "No influencers found."

        distances, indices = caption_index.search(self.query_embedding, top_k)
        # FAISS pads missing results with label -1; never let those reach
        # ``iloc``, where -1 would silently select the last row.
        selected = [
            (idx, sim)
            for idx, sim in zip(indices[0], distances[0])
            if idx >= 0 and sim >= similarity_threshold
        ]
        if not selected:
            return "No influencers found."

        # === Format results ===
        outer_list = []
        for rank, (idx, _sim) in enumerate(selected, 1):
            row = caption_df.iloc[idx]
            # NOTE(review): the first literal originally held a raw
            # line-break character; it is written here as an explicit "\n".
            outer_list.append([
                f"[{rank}]. \nThe influencer name is: **{row['username']}** — "
                f"Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ])

        cleaned_response = clean_text(str(outer_list))
        encoding = tiktoken.encoding_for_model('gpt-4o-mini')
        tokens = encoding.encode(cleaned_response)
        # Keep only the leading tokens so the summary stays within budget.
        return encoding.decode(tokens[:max_tokens])

    def imdb_ideas(self, top_k=4):
        """Return the ideas from ``ideas_df`` nearest to the business query.

        Args:
            top_k: Number of neighbours requested from ``ideas_index``.

        Returns:
            The cleaned, stringified list of ``Idea [rank]: ...`` entries.
            Fewer than ``top_k`` entries are returned when the index holds
            fewer vectors.
        """
        _, indices = ideas_index.search(self.query_embedding, top_k)
        # Drop the -1 padding FAISS emits when fewer than ``top_k`` vectors
        # exist; ``iloc[-1]`` would otherwise return the last row as a hit.
        valid_indices = [idx for idx in indices[0] if idx >= 0]

        outer_list = [
            [f"Idea [{rank}]: **{ideas_df.iloc[idx]['idea']}\n**"]
            for rank, idx in enumerate(valid_indices, 1)
        ]
        cleaned_response = clean_text(str(outer_list))
        return str(cleaned_response)