Spaces:
Sleeping
Sleeping
File size: 1,974 Bytes
6c655a3 be3a5c4 583f6dd 1ce8b88 3e87e76 6c655a3 b02093e 583f6dd be3a5c4 eb40d68 3e87e76 eb40d68 3e87e76 6c655a3 9e8b261 b02093e 6874dac 6c655a3 6874dac 6c655a3 6874dac 6c655a3 b02093e 6874dac 6c655a3 6874dac 6c655a3 6874dac 6c655a3 6874dac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import os
import numpy as np
from src.genai.utils.models_loader import embedding_model , llm
import numpy as np
import faiss
import tiktoken
from src.genai.utils.load_embeddings import caption_index , caption_df
from src.genai.utils.utils import clean_text
def retrieve_tool(video_topic):
    '''
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **video topic**.
    '''
    # NOTE(review): the docstring above doubles as the runtime tool
    # description surfaced to the agent framework, so it is left verbatim.
    #
    # Embed the query and L2-normalize it so the FAISS inner-product search
    # behaves as cosine similarity (assumes the index was built from
    # normalized embeddings — TODO confirm in load_embeddings).
    query_embedding = np.array(
        embedding_model.embed_query(str(video_topic))
    ).reshape(1, -1).astype('float32')
    faiss.normalize_L2(query_embedding)

    # Search the entire corpus; hits are filtered by similarity below.
    top_k = len(caption_df)
    if top_k == 0:
        # Guard: FAISS rejects k == 0, and an empty corpus cannot match.
        return "No influencers found."
    distances, indices = caption_index.search(query_embedding, top_k)

    # Keep only hits above the similarity cutoff. FAISS pads short result
    # sets with index -1; those must be dropped before .iloc below, where
    # -1 would silently select the last row.
    similarity_threshold = 0.35
    selected = [
        (idx, sim)
        for idx, sim in zip(indices[0], distances[0])
        if idx >= 0 and sim >= similarity_threshold
    ]
    if not selected:
        return "No influencers found."

    # === Format results ===
    outer_list = []
    for rank, (idx, sim) in enumerate(selected, 1):
        row = caption_df.iloc[idx]
        res = {
            'rank': rank,
            'username': row['username'],
            'visible_text_or_brandings': row['visible_texts_or_brandings'],
            'likesCount': row['likesCount'],
            'commentCount': row['commentCount'],
            'product_or_service_details': row['product_or_service_details'],
        }
        inner_list = [
            f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**",
            f"The branding or promotion done is:\n{res['visible_text_or_brandings']}",
            f"The details of product or service is:\n{res['product_or_service_details']}"
        ]
        outer_list.append(inner_list)

    # Clean the stringified results (clean_text's exact behavior lives in
    # utils — presumably whitespace/markup normalization; verify there),
    # then cap the payload at 1000 tokens so the tool output fits the
    # downstream model's context. tiktoken's decode() tolerates a cut that
    # lands mid-word.
    cleaned_response = clean_text(str(outer_list))
    encoding = tiktoken.encoding_for_model('gpt-4o-mini')
    tokens = encoding.encode(cleaned_response)
    trimmed_response = tokens[:1000]
    return encoding.decode(trimmed_response)
|