Spaces:
Sleeping
Sleeping
File size: 2,462 Bytes
be3a5c4 ca75c57 93a5bf9 1ce8b88 3e87e76 6523fd0 be3a5c4 ca75c57 3e87e76 be3a5c4 3e87e76 6178c40 3e87e76 ca75c57 3e87e76 ca75c57 3e87e76 9e8b261 3e87e76 ca75c57 1ce8b88 3e87e76 9e8b261 3e87e76 1ce8b88 3e87e76 9e8b261 3e87e76 1ce8b88 3e87e76 1ce8b88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
from dotenv import load_dotenv
load_dotenv()
import os
import numpy as np
from langchain_core.tools import tool
from utils.data_loader import load_influencer_data
from utils.models_loader import ST , llm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain_core.messages import SystemMessage
import re
import faiss
import ast
import pandas as pd
from .state import QueryFormatter
# Re-export the Groq API key into the process environment (load_dotenv() above
# may already have populated it from a .env file). os.environ only accepts str
# values, so assigning the None that os.getenv returns for a missing key would
# raise TypeError at import time; skip the assignment when the key is absent.
_groq_api_key = os.getenv('GROQ_API_KEY')
if _groq_api_key is not None:
    os.environ['GROQ_API_KEY'] = _groq_api_key
# NOTE(review): the tool name below contains an apostrophe and spaces. Some
# chat-completion providers restrict function/tool names to [a-zA-Z0-9_-];
# confirm the target provider accepts this name before relying on it.
@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
def retrieve_tool(messages, business_details):
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **user messages** and the **business details**.
    '''
    # The docstring above is kept verbatim because it may be surfaced to the
    # model as part of the tool description; developer notes live in comments.
    #
    # Args:
    #     messages: user messages; stringified and used as the first half of
    #         the search query.
    #     business_details: business description; stringified and appended to
    #         the query.
    # Returns:
    #     str(list_of_lists): one inner list of three formatted strings
    #     (headline with username/likes/comments, story, branding) per hit,
    #     ordered by increasing L2 distance. "[]" when the CSV has no rows.
    # Raises:
    #     Whatever pd.read_csv / ast.literal_eval / faiss raise for a missing
    #     file, a malformed embedding cell, or a dimension mismatch.

    # === Load CSV ===
    csv_path = 'extracted_data.csv'
    df = pd.read_csv(csv_path)
    if df.empty:
        # np.vstack cannot stack zero rows; there is nothing to search anyway.
        return str([])

    # === Parse stored embeddings ===
    # Embedding cells read back from CSV are strings holding a Python list
    # literal; literal_eval turns them into lists without executing code.
    vectors = [
        ast.literal_eval(cell) if isinstance(cell, str) else cell
        for cell in df['embeddings']
    ]
    embeddings = np.vstack(vectors).astype('float32')

    # === Build FAISS index ===
    # The index is rebuilt on every call from the CSV contents.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # === Encode the query and search ===
    # ST comes from utils.models_loader (presumably a SentenceTransformer —
    # confirm); its output is reshaped to the (1, dim) float32 layout that
    # index.search expects.
    query_text = str(messages) + str(business_details)
    query_embedding = ST.encode(query_text).reshape(1, -1).astype('float32')
    # Never request more neighbours than there are rows: FAISS pads missing
    # results with index -1, which df.iloc would resolve to the last row.
    top_k = min(10, len(df))
    _, indices = index.search(query_embedding, top_k)

    # === Format results ===
    # Use the dedicated branding column when the CSV provides one; otherwise
    # fall back to 'story' so files without that column keep working.
    has_branding_column = 'visible_text_or_brandings' in df.columns
    outer_list = []
    for rank, idx in enumerate(indices[0], start=1):
        if idx < 0:
            # Defensive: skip FAISS "no result" padding.
            continue
        row = df.iloc[idx]
        story = row['story']
        branding = row['visible_text_or_brandings'] if has_branding_column else story
        outer_list.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The story of that particular video is:\n{story}",
            f"The branding or promotion done is:\n{branding}",
        ])
    return str(outer_list)
|