File size: 2,022 Bytes
3a3fe92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

from langchain_core.tools import tool
from .state import QueryFormatter
import pandas as pd
import numpy as np
import ast
import faiss
from utils.models_loader import ST

@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
def retrieve_tool(video_topic):
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **video topic**.

    Reads ``extracted_data.csv``, builds an exact L2 FAISS index over the
    stored ``embeddings`` column, encodes ``video_topic`` with the shared
    ``ST`` model and formats the nearest rows (at most seven).

    Args:
        video_topic: Topic to search for; it is passed through ``str()``
            before being encoded.

    Returns:
        ``str()`` of a list holding one entry per match, best match first.
        Each entry is a list of three display strings: a headline with
        username / likes / comments, the story text, and the branding text.
        Returns ``"[]"`` when the CSV contains no rows.
    '''
    # === Load CSV ===
    csv_path = 'extracted_data.csv'
    df = pd.read_csv(csv_path)

    # Nothing to index: np.vstack would raise on an empty sequence.
    if df.empty:
        return str([])

    # === Parse stored embeddings ===
    # Embeddings are stored as list literals in the CSV; literal_eval parses
    # them without executing arbitrary code.
    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    embeddings = np.vstack(df['embeddings'].values).astype('float32')

    # === Build FAISS index ===
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # === Encode the query and search ===
    query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
    # Never ask for more neighbours than there are rows: FAISS pads the
    # missing slots with label -1, which df.iloc would read as "last row".
    top_k = min(7, len(df))
    _distances, indices = index.search(query_embedding, top_k)

    # === Format results ===
    outer_list = []
    for rank, idx in enumerate(indices[0], start=1):
        if idx < 0:
            # Defensive: a -1 label means "no neighbour", not a real row.
            continue

        row = df.iloc[idx]
        # NOTE(review): the original filled this field from 'story'. Prefer a
        # same-named column when the CSV has one; otherwise keep the old
        # behaviour. Confirm the column name against the CSV schema.
        branding = row.get('visible_text_or_brandings', row['story'])

        outer_list.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The story of that particular video is:\n{row['story']}",
            f"The branding or promotion done is:\n{branding}",
        ])

    return str(outer_list)