File size: 1,974 Bytes
6c655a3
be3a5c4
 
583f6dd
1ce8b88
3e87e76
6c655a3
b02093e
583f6dd
be3a5c4
eb40d68
3e87e76
 
eb40d68
3e87e76
6c655a3
 
9e8b261
b02093e
 
6874dac
6c655a3
 
6874dac
6c655a3
 
6874dac
 
 
6c655a3
b02093e
6874dac
6c655a3
 
 
 
 
 
6874dac
 
6c655a3
 
 
 
 
6874dac
 
6c655a3
 
 
 
 
6874dac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

import os
import numpy as np
from src.genai.utils.models_loader import  embedding_model , llm
import numpy as np
import faiss
import tiktoken
from src.genai.utils.load_embeddings import  caption_index , caption_df
from src.genai.utils.utils import clean_text

# Semantic retrieval over the caption corpus.
#
# Embeds `video_topic`, L2-normalises the query vector, searches
# `caption_index` for every row of `caption_df`, keeps the hits whose score is
# at least 0.35, formats them as text, and returns at most the first 1000
# tokens of that text.  Returns the string "No influencers found." when nothing
# qualifies.
#
# NOTE(review): the docstring below reads like an LLM tool description and is
# presumably consumed at runtime by whatever registers this function as a
# tool, so its wording and the function signature are deliberately unchanged
# -- confirm against the caller before editing either.
def retrieve_tool(video_topic):
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **video topic**.
    '''
    # Scores are compared with `>=`, i.e. treated as similarities.
    # NOTE(review): assumes `caption_index` returns inner-product scores over
    # normalised vectors (cosine similarity) -- TODO confirm.
    similarity_threshold = 0.35
    max_tokens = 1000
    no_match_message = "No influencers found."

    # An empty corpus would make us ask FAISS for k == 0 neighbours, which it
    # rejects; there is nothing to retrieve anyway.
    row_count = len(caption_df)
    if row_count == 0:
        return no_match_message

    query_embedding = (
        np.array(embedding_model.embed_query(str(video_topic)))
        .reshape(1, -1)
        .astype('float32')
    )
    faiss.normalize_L2(query_embedding)  # normalises in place

    # Search the whole corpus; the threshold below decides what is kept.
    distances, indices = caption_index.search(query_embedding, row_count)

    # FAISS pads missing results with label -1.  `iloc[-1]` would silently
    # return the last row, and a label past the end of the frame would raise,
    # so only labels that address a real row are accepted.
    selected = [
        (idx, sim)
        for idx, sim in zip(indices[0], distances[0])
        if 0 <= idx < row_count and sim >= similarity_threshold
    ]

    if not selected:
        return no_match_message

    # === Format results ===
    outer_list = []
    for rank, (idx, _sim) in enumerate(selected, 1):
        row = caption_df.iloc[idx]
        outer_list.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
            f"The details of product or service is:\n{row['product_or_service_details']}",
        ])

    # The nested list is stringified as a whole before cleaning, so the list
    # punctuation is part of what `clean_text` receives.
    cleaned_response = clean_text(str(outer_list))

    # Cap the payload at `max_tokens` tokens of the gpt-4o-mini tokenizer.
    encoding = tiktoken.encoding_for_model('gpt-4o-mini')
    tokens = encoding.encode(cleaned_response)
    return encoding.decode(tokens[:max_tokens])