Spaces:
Sleeping
Sleeping
File size: 2,022 Bytes
3a3fe92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
from langchain_core.tools import tool
from .state import QueryFormatter
import pandas as pd
import numpy as np
import ast
import faiss
from utils.models_loader import ST
@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
def retrieve_tool(video_topic):
    """Always invoke this tool.

    Retrieve influencer's data by semantic search of **video topic**.

    Args:
        video_topic: Topic to search for. It is converted with ``str()``
            before being encoded by the shared ``ST`` model.

    Returns:
        ``str()`` of a list holding one inner list per hit (at most 7,
        nearest first). Each inner list has three formatted strings: the
        ranked username with like/comment counts, the video story, and the
        branding/promotion text. ``"[]"`` when the CSV has no rows.
    """
    # NOTE(review): the CSV is re-read and the index rebuilt on every call;
    # consider caching if this tool is invoked frequently.
    csv_path = 'extracted_data.csv'
    df = pd.read_csv(csv_path)

    # Nothing to search: np.vstack would raise on an empty sequence.
    if df.empty:
        return str([])

    # Embeddings are stored in the CSV as list literals; parse them back
    # into numeric vectors (values that are not strings pass through).
    df['embeddings'] = df['embeddings'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    embeddings = np.vstack(df['embeddings'].values).astype('float32')

    # Exact L2 index over all stored vectors.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')

    # Never request more neighbours than there are rows: FAISS pads missing
    # results with -1, which df.iloc would resolve to the *last* row.
    top_k = min(7, len(df))
    _distances, indices = index.search(query_embedding, top_k)

    outer_list = []
    for rank, idx in enumerate(indices[0], start=1):
        if idx < 0:
            # Defensive: skip FAISS "no result" placeholders.
            continue
        row = df.iloc[idx]
        # The original read 'story' here, so the branding line duplicated the
        # story. Use the dedicated column when the CSV provides it and fall
        # back to the old behaviour otherwise.
        # NOTE(review): confirm 'visible_text_or_brandings' is the CSV column name.
        brandings = row.get('visible_text_or_brandings', row['story'])
        outer_list.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The story of that particular video is:\n{row['story']}",
            f"The branding or promotion done is:\n{brandings}",
        ])
    return str(outer_list)
|