File size: 2,554 Bytes
6874dac
 
 
 
d98138c
6f57d05
583f6dd
6c655a3
6874dac
 
38cf703
 
 
 
 
 
 
 
6f57d05
38cf703
6f57d05
38cf703
 
 
 
 
 
 
 
 
 
 
e7801d4
38cf703
 
6f57d05
38cf703
6f57d05
38cf703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f57d05
 
 
 
6874dac
 
 
6c655a3
b4fb6ac
 
6874dac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import faiss
import ast
import pandas as pd
import numpy as np
from src.genai.utils.data_loader import caption_df, caption_index
from src.genai.utils.models_loader import embedding_model , encoding_model
from src.genai.utils.utils import clean_text
import tiktoken


class InfluencerRetrievalTool:
    """Tool for retrieving influencer data based on semantic search.

    Wraps the module-level caption dataframe and its vector index. The labels
    returned by ``self.index.search`` are used as positional row numbers into
    ``self.df`` (via ``iloc``).
    """

    # Number of nearest neighbours requested by retrieve_for_analytics.
    ANALYTICS_TOP_K = 10
    # Minimum score a hit needs to be included by retrieve_for_orchestration.
    SIMILARITY_THRESHOLD = 0.35
    # Upper bound on the number of tokens in the orchestration response.
    MAX_RESPONSE_TOKENS = 1000
    # Returned by retrieve_for_orchestration when there is nothing to report.
    NO_RESULTS_MESSAGE = "No influencers found."

    def __init__(self):
        self.df = caption_df
        self.index = caption_index

    @staticmethod
    def _embed(text):
        """Embed ``str(text)`` and return it as a (1, dim) float32 array."""
        vector = np.array(embedding_model.embed_query(str(text)))
        return vector.reshape(1, -1).astype('float32')

    @staticmethod
    def _valid_hits(indices, distances):
        """Pair up the first query's labels and scores, dropping padding.

        FAISS fills the result with label -1 when fewer than ``k`` neighbours
        are available. Passing -1 to ``DataFrame.iloc`` would silently select
        the *last* row, so negative labels are discarded here.
        """
        return [
            (idx, score)
            for idx, score in zip(indices[0], distances[0])
            if idx >= 0
        ]

    @staticmethod
    def _optional_int(value):
        """Return ``int(value)``, or None when the value is null/NaN."""
        return int(value) if pd.notnull(value) else None

    def retrieve_for_analytics(self, business_details):
        """Return the nearest captions to ``business_details`` as dicts.

        Args:
            business_details: Any object; its ``str()`` form is embedded and
                used as the search query.

        Returns:
            A list of at most ``ANALYTICS_TOP_K`` dicts with the keys
            ``url``, ``username``, ``likesCount`` and ``commentCount``.
            The two counts are ints, or None when the stored value is null.
        """
        print('Generating embeddings..')
        # NOTE(review): unlike retrieve_for_orchestration, the query is not
        # L2-normalised here. Only the ranking is used, which is unaffected
        # for an inner-product index -- confirm the index metric.
        query_embedding = self._embed(business_details)
        print('Embeddings generated')
        distances, indices = self.index.search(
            query_embedding, self.ANALYTICS_TOP_K
        )
        results = []
        for idx, _score in self._valid_hits(indices, distances):
            row = self.df.iloc[idx]
            results.append({
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': self._optional_int(row['likesCount']),
                'commentCount': self._optional_int(row['commentCount']),
            })
        return results

    def retrieve_for_orchestration(self, query):
        """Return a text summary of every caption similar enough to ``query``.

        The whole dataframe is searched and hits whose score is at least
        ``SIMILARITY_THRESHOLD`` are kept, in the order the index returns
        them. The summary is passed through ``clean_text`` and truncated to
        ``MAX_RESPONSE_TOKENS`` tokens.

        Args:
            query: Any object; its ``str()`` form is embedded, L2-normalised
                and used as the search query.

        Returns:
            The truncated summary string, or ``NO_RESULTS_MESSAGE`` when the
            dataframe is empty or no hit reaches the threshold.
        """
        total_rows = len(self.df)
        if total_rows == 0:
            # Searching with k == 0 is rejected by FAISS; nothing to return.
            return self.NO_RESULTS_MESSAGE

        query_embedding = self._embed(query)
        print('Embeddings Generated')
        faiss.normalize_L2(query_embedding)
        print('Query embedded')
        distances, indices = self.index.search(query_embedding, total_rows)

        # Scores are treated as similarities (higher is better); presumably
        # the index holds normalised vectors under inner product, making the
        # score a cosine similarity -- TODO confirm against the index build.
        selected = [
            (idx, sim)
            for idx, sim in self._valid_hits(indices, distances)
            if sim >= self.SIMILARITY_THRESHOLD
        ]
        if not selected:
            return self.NO_RESULTS_MESSAGE

        outer_list = []
        for rank, (idx, _sim) in enumerate(selected, 1):
            row = self.df.iloc[idx]
            outer_list.append([
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ])

        # The list's repr (not a joined string) is what gets cleaned, as before.
        cleaned_response = clean_text(str(outer_list))
        print('response cleaned')
        tokens = encoding_model.encode(cleaned_response)[:self.MAX_RESPONSE_TOKENS]
        print('tokens got')
        return encoding_model.decode(tokens)