File size: 2,341 Bytes
be3a5c4
 
 
 
 
 
ca75c57
93a5bf9
 
1ce8b88
 
 
3e87e76
 
 
6523fd0
be3a5c4
ca75c57
3e87e76
eb40d68
3e87e76
 
eb40d68
3e87e76
 
 
 
be3a5c4
3e87e76
 
 
6178c40
3e87e76
 
 
 
ca75c57
3e87e76
ca75c57
3e87e76
eb40d68
 
3e87e76
ca75c57
1ce8b88
 
3e87e76
 
 
 
 
 
9e8b261
 
3e87e76
 
 
1ce8b88
3e87e76
 
9e8b261
 
 
3e87e76
1ce8b88
3e87e76
1ce8b88
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
from dotenv import load_dotenv
load_dotenv()
import os
import numpy as np
from langchain_core.tools import tool
from utils.data_loader import load_influencer_data
from utils.models_loader import  ST , llm
import numpy as np
from langchain_core.messages import SystemMessage
import re
import faiss
import ast
import pandas as pd
from .state import QueryFormatter

# Re-export GROQ_API_KEY only when it is actually set. Assigning None to
# os.environ raises "TypeError: str expected, not NoneType", which would make
# this module fail at import time on machines without the key configured.
_groq_api_key = os.getenv('GROQ_API_KEY')
if _groq_api_key is not None:
    os.environ['GROQ_API_KEY'] = _groq_api_key
# NOTE(review): the tool name contains an apostrophe and spaces; some
# function-calling APIs only accept names made of letters, digits, "_" and "-".
# Left unchanged because other code may look the tool up by this exact name —
# confirm the provider accepts it.
@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
def retrieve_tool(video_topic) -> str:
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **video topic**.

    Reads ``extracted_data.csv`` (relative to the current working directory),
    builds an exact L2 FAISS index over its ``embeddings`` column, encodes the
    topic with ``ST`` and returns the closest rows.

    Args:
        video_topic: Topic to search for; it is passed through ``str()`` before
            being encoded.

    Returns:
        ``str()`` of a list holding one entry per match, best match first
        (at most 7, fewer when the CSV has fewer rows). Each entry is a list of
        three strings: a headline with username / likes / comments, the video
        story, and the visible text or branding. An empty CSV yields ``"[]"``.
    '''
    max_results = 7
    branding_column = 'visible_text_or_brandings'

    # NOTE: the CSV is re-read and the index rebuilt on every call, so edits to
    # the file are picked up immediately at the cost of repeated work.
    df = pd.read_csv('extracted_data.csv')
    if df.empty:
        # np.vstack cannot stack zero rows; there is nothing to search anyway.
        return str([])

    # Embeddings are stored as stringified Python lists; literal_eval only
    # accepts literals, so it is safe to use on file contents.
    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    embeddings = np.vstack(df['embeddings'].values).astype('float32')

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')

    # Never request more neighbours than there are rows: FAISS pads missing
    # results with -1, and df.iloc[-1] would silently return the last row.
    top_k = min(max_results, len(df))
    _, indices = index.search(query_embedding, top_k)

    # Report the dedicated branding column when the CSV provides it; fall back
    # to the story text (the previous behaviour) when it does not.
    if branding_column not in df.columns:
        branding_column = 'story'

    outer_list = []
    for rank, idx in enumerate(indices[0], start=1):
        if idx < 0:
            # Defensive: a negative id means "no neighbour found".
            continue
        row = df.iloc[idx]
        outer_list.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The story of that particular video is:\n{row['story']}",
            f"The branding or promotion done is:\n{row[branding_column]}",
        ])

    return str(outer_list)