# Refined chatbot — subashpoudel (commit 6f57d05)
import faiss
import ast
import pandas as pd
import numpy as np
from src.genai.utils.data_loader import caption_df, caption_index
from src.genai.utils.models_loader import embedding_model , encoding_model
from src.genai.utils.utils import clean_text
import tiktoken
class InfluencerRetrievalTool:
    """Tool for retrieving influencer data based on semantic search.

    Wraps a FAISS index over caption embeddings (``caption_index``) that is
    aligned row-for-row with the caption dataframe (``caption_df``): a FAISS
    result index ``i`` maps to ``caption_df.iloc[i]``.
    """

    def __init__(self):
        # Module-level singletons loaded once by the data_loader at import time.
        self.df = caption_df
        self.index = caption_index

    def retrieve_for_analytics(self, business_details):
        """Return the top-10 influencer records most similar to *business_details*.

        Parameters
        ----------
        business_details : Any
            Free-form business description; stringified before embedding.

        Returns
        -------
        list[dict]
            One dict per hit with keys ``url``, ``username``, ``likesCount``,
            ``commentCount`` (counts are ``int`` or ``None`` when missing).
        """
        print('Generating embeddings..')
        query_embedding = np.array(embedding_model.embed_query(str(business_details))).reshape(1, -1).astype('float32')
        print('Embeddings generated')
        # NOTE(review): unlike retrieve_for_orchestration, the query is NOT
        # L2-normalized here — confirm this matches how the index was built.
        distances, indices = self.index.search(query_embedding, 10)
        results = []
        for idx in indices[0]:
            # FAISS pads with -1 when fewer than k vectors exist; iloc[-1]
            # would silently return the LAST dataframe row, so skip padding.
            if idx < 0:
                continue
            row = self.df.iloc[idx]
            results.append({
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': int(row['likesCount']) if pd.notnull(row['likesCount']) else None,
                'commentCount': int(row['commentCount']) if pd.notnull(row['commentCount']) else None
            })
        return results

    def retrieve_for_orchestration(self, query):
        """Return a token-capped textual summary of influencers matching *query*.

        Searches the whole index, keeps hits scoring >= 0.35 (presumably an
        inner-product index over normalized vectors, so scores are cosine
        similarities — TODO confirm), formats each hit, cleans the text and
        truncates the result to 1000 tokens.

        Returns
        -------
        str
            Cleaned, truncated summary, or ``"No influencers found."`` when
            nothing clears the threshold.
        """
        query_embedding = np.array(embedding_model.embed_query(str(query))).reshape(1, -1).astype('float32')
        print('Embeddings Generated')
        faiss.normalize_L2(query_embedding)
        print('Query embedded')
        distances, indices = self.index.search(query_embedding, len(self.df))
        similarity_threshold = 0.35
        # Drop FAISS's -1 padding entries as well as low-similarity hits;
        # a -1 passed to iloc would silently select the last dataframe row.
        selected = [(idx, sim) for idx, sim in zip(indices[0], distances[0])
                    if idx >= 0 and sim >= similarity_threshold]
        if not selected:
            return "No influencers found."
        outer_list = []
        for rank, (idx, sim) in enumerate(selected, 1):
            row = self.df.iloc[idx]
            inner_list = [
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}"
            ]
            outer_list.append(inner_list)
        cleaned_response = clean_text(str(outer_list))
        print('response cleaned')
        # Hard cap the orchestration context at 1000 tokens.
        tokens = encoding_model.encode(cleaned_response)[:1000]
        print('tokens got')
        return encoding_model.decode(tokens)