Spaces:
Sleeping
Sleeping
| import faiss | |
| import ast | |
| import pandas as pd | |
| import numpy as np | |
| from src.genai.utils.data_loader import caption_df, caption_index | |
| from src.genai.utils.models_loader import embedding_model , encoding_model | |
| from src.genai.utils.utils import clean_text | |
| import tiktoken | |
class InfluencerRetrievalTool:
    """Tool for retrieving influencer data based on semantic search.

    The tool embeds a free-text query, searches the caption vector index
    with it and maps the returned row positions back onto the caption
    DataFrame.

    Attributes:
        df: Caption DataFrame; row positions are expected to line up with
            the vector ids stored in ``index``.
        index: Vector index exposing ``search(queries, k)``.
        embedder: Embedding model exposing ``embed_query(text)``.
        encoder: Tokenizer exposing ``encode(text)`` / ``decode(tokens)``.
    """

    NO_RESULTS_MESSAGE = "No influencers found."

    def __init__(self):
        self.df = caption_df
        self.index = caption_index
        # Keep the models on the instance, next to the data they are used
        # with, so the methods below do not depend on module globals.
        self.embedder = embedding_model
        self.encoder = encoding_model

    def _embed(self, text):
        """Return the embedding of ``str(text)`` as a (1, dim) float32 array."""
        vector = self.embedder.embed_query(str(text))
        # np.array (not asarray) so that a later in-place normalisation never
        # mutates an array owned by the embedding model.
        return np.array(vector, dtype='float32').reshape(1, -1)

    def retrieve_for_analytics(self, business_details, top_k=10):
        """Return engagement data for the posts closest to ``business_details``.

        Args:
            business_details: Any object; its ``str()`` form is embedded.
            top_k: Maximum number of posts to return (default 10).

        Returns:
            A list of at most ``top_k`` dicts with the keys ``url``,
            ``username``, ``likesCount`` and ``commentCount``. Missing counts
            are reported as ``None``.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError("top_k must be a positive integer")

        print('Generating embeddings..')
        query_embedding = self._embed(business_details)
        print('Embeddings generated')
        # NOTE(review): unlike retrieve_for_orchestration, the query is not
        # L2-normalised here. Only the ranking is used, so this is harmless
        # for an inner-product index - confirm the index metric.
        _, indices = self.index.search(query_embedding, top_k)

        results = []
        for idx in indices[0]:
            # FAISS pads the result with -1 when fewer than top_k vectors are
            # available; iloc[-1] would silently return the last row instead.
            if idx < 0:
                continue
            row = self.df.iloc[idx]
            likes = row['likesCount']
            comments = row['commentCount']
            results.append({
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': int(likes) if pd.notnull(likes) else None,
                'commentCount': int(comments) if pd.notnull(comments) else None,
            })
        return results

    def retrieve_for_orchestration(self, query, similarity_threshold=0.35, max_tokens=1000):
        """Return a token-capped text summary of influencers matching ``query``.

        Args:
            query: Any object; its ``str()`` form is embedded.
            similarity_threshold: Minimum score a hit must reach to be kept
                (default 0.35).
            max_tokens: Maximum number of tokens in the returned text
                (default 1000).

        Returns:
            The cleaned, truncated description of every matching row, or
            ``"No influencers found."`` when nothing reaches the threshold.
        """
        total_rows = len(self.df)
        if total_rows == 0:
            # A search with k == 0 is rejected by FAISS; nothing to find anyway.
            return self.NO_RESULTS_MESSAGE

        query_embedding = self._embed(query)
        print('Embeddings Generated')
        faiss.normalize_L2(query_embedding)
        print('Query embedded')

        # The whole index is scanned so that the threshold alone decides how
        # many rows are kept.
        # NOTE(review): treating the returned scores as similarities assumes
        # an inner-product index over normalised vectors - TODO confirm.
        distances, indices = self.index.search(query_embedding, total_rows)
        selected = [
            (idx, sim)
            for idx, sim in zip(indices[0], distances[0])
            # idx < 0 is the FAISS "no result" sentinel, never a real row.
            if idx >= 0 and sim >= similarity_threshold
        ]
        if not selected:
            return self.NO_RESULTS_MESSAGE

        outer_list = []
        for rank, (idx, _) in enumerate(selected, 1):
            row = self.df.iloc[idx]
            outer_list.append([
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ])

        cleaned_response = clean_text(str(outer_list))
        print('response cleaned')
        # Cap the response length in tokens before handing it back.
        tokens = self.encoder.encode(cleaned_response)[:max_tokens]
        print('tokens got')
        return self.encoder.decode(tokens)