subashpoudel's picture
Refined embedding loader
b4fb6ac
raw
history blame
3.17 kB
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
from dotenv import load_dotenv
load_dotenv()
import os
import numpy as np
from langchain_core.tools import tool
# from utils.data_loader import load_influencer_data
from utils.models_loader import ST , llm
import numpy as np
from langchain_core.messages import SystemMessage
import re
import faiss
import ast
import pandas as pd
from .state import QueryFormatter
from utils.load_embeddings import embeddings , index
# Re-export the Groq key only when it is actually configured. os.environ only
# accepts str values, so assigning the None that os.getenv() returns for a
# missing key would raise TypeError at import time.
_groq_api_key = os.getenv('GROQ_API_KEY')
if _groq_api_key is not None:
    os.environ['GROQ_API_KEY'] = _groq_api_key
# @tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
# CSV holding one row per indexed video. The code below uses the labels
# returned by the FAISS index as positional row numbers into this table.
_INFLUENCER_CSV = 'extracted_data.csv'

# Lazily populated cache so the CSV is parsed once instead of on every query.
_influencer_df = None


def _load_influencer_data():
    """Return the influencer DataFrame, reading the CSV only on first use.

    A failed read is not cached, so a later call retries the load.
    """
    global _influencer_df
    if _influencer_df is None:
        _influencer_df = pd.read_csv(_INFLUENCER_CSV)
    return _influencer_df


def _search_influencers(video_topic, top_k):
    """Run a semantic search for *video_topic* and format the hits as text.

    Args:
        video_topic: Query to embed; it is converted with ``str()`` first.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        ``str()`` of a list with one entry per hit. Each entry is a list of
        three strings: a ranked headline (username, likes, comments), the
        video story, and the visible text / brandings.
    """
    df = _load_influencer_data()
    # The index is queried with a single float32 row vector, shape (1, dim).
    query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
    _distances, indices = index.search(query_embedding, top_k)

    row_count = len(df)
    results = []
    for rank, idx in enumerate(indices[0], start=1):
        # FAISS pads the result with label -1 when fewer than top_k vectors
        # are available; df.iloc[-1] would silently return the last CSV row
        # as a bogus hit. Labels beyond the table are skipped as well.
        if not 0 <= idx < row_count:
            continue
        row = df.iloc[idx]  # materialise the row once, not once per field
        results.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The story of that particular video is:\n{row['story']}",
            f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
        ])
    return str(results)


# NOTE(review): the docstring below is kept verbatim because a tool framework
# may surface it to the model as the tool description - confirm before editing.
# Returns the stringified list of the 10 closest matches (see _search_influencers).
def retrieve_tool(video_topic):
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **video topic**.
    '''
    return _search_influencers(video_topic, top_k=10)


# Same search as retrieve_tool, limited to the 5 closest matches.
# NOTE(review): docstring kept verbatim for the same reason as retrieve_tool.
def retrieve_manual(video_topic):
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **video topic**.
    '''
    return _search_influencers(video_topic, top_k=5)