Spaces:
Sleeping
Sleeping
| from langchain_core.tools import tool | |
| from .state import QueryFormatter | |
| import pandas as pd | |
| import numpy as np | |
| import ast | |
| import faiss | |
| import tiktoken | |
| from src.genai.utils.models_loader import embedding_model | |
| from src.genai.utils.data_loader import caption_index , caption_df, ideas_index , ideas_df | |
| from src.genai.utils.utils import clean_text | |
class Retrieval:
    """Look up influencer posts and content ideas similar to a business profile.

    The business details are embedded once at construction time and the
    resulting L2-normalised vector is reused as the query for every search
    against the module-level FAISS indexes (``caption_index``/``ideas_index``).
    """

    def __init__(self, business_details):
        """Embed ``business_details`` and cache the normalised query vector.

        Args:
            business_details: Description of the business. It is converted
                with ``str()`` before being handed to the embedding model.
        """
        self.business_details = business_details
        # FAISS searches take a 2-D float32 array: one row per query vector.
        self.query_embedding = (
            np.array(embedding_model.embed_query(str(business_details)))
            .reshape(1, -1)
            .astype('float32')
        )
        # Normalised in place. NOTE(review): presumably the indexed vectors are
        # unit-length too, so scores behave like cosine similarity -- confirm
        # against the code that builds the indexes.
        faiss.normalize_L2(self.query_embedding)

    def influencers_data(self, *, similarity_threshold=0.35, max_tokens=100):
        """Describe influencer posts whose score meets ``similarity_threshold``.

        Every row of ``caption_df`` is scored against the query; rows scoring
        at least ``similarity_threshold`` are formatted, cleaned with
        ``clean_text`` and truncated to ``max_tokens`` tokens.

        Args:
            similarity_threshold: Minimum score a hit needs to be kept.
            max_tokens: Maximum number of ``gpt-4o-mini`` tokens to return;
                ``None`` disables truncation.

        Returns:
            The cleaned, token-trimmed text, or ``"No influencers found."``
            when nothing passes the threshold.
        """
        top_k = len(caption_df)
        if top_k == 0:
            # FAISS rejects k == 0, and there is nothing to match anyway.
            return "No influencers found."

        distances, indices = caption_index.search(self.query_embedding, top_k)

        # FAISS pads missing results with label -1; ``iloc[-1]`` would quietly
        # return the last row, so those placeholders are dropped explicitly.
        selected = [
            (idx, sim)
            for idx, sim in zip(indices[0], distances[0])
            if idx >= 0 and sim >= similarity_threshold
        ]
        if not selected:
            return "No influencers found."

        # === Format results ===
        outer_list = []
        for rank, (idx, _sim) in enumerate(selected, 1):
            row = caption_df.iloc[idx]
            outer_list.append([
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ])

        cleaned_response = clean_text(str(outer_list))

        # Trim by tokens rather than characters to cap the size of the result.
        encoding = tiktoken.encoding_for_model('gpt-4o-mini')
        tokens = encoding.encode(cleaned_response)
        return encoding.decode(tokens[:max_tokens])

    def imdb_ideas(self, *, top_k=4):
        """Return the ``top_k`` ideas from ``ideas_df`` nearest to the query.

        Args:
            top_k: Number of neighbours to request from ``ideas_index``.

        Returns:
            The ``clean_text``-processed string form of the ranked idea list.
            Fewer than ``top_k`` ideas are listed when the index holds fewer
            vectors.
        """
        _distances, indices = ideas_index.search(self.query_embedding, top_k)

        # Drop FAISS's -1 padding so a short index does not yield the last
        # DataFrame row as a bogus extra idea; ranks stay consecutive.
        hits = [idx for idx in indices[0] if idx >= 0]

        outer_list = [
            [f"Idea [{rank}]: **{ideas_df.iloc[idx]['idea']}\n**"]
            for rank, idx in enumerate(hits, 1)
        ]
        cleaned_response = clean_text(str(outer_list))
        return str(cleaned_response)