Spaces:
Sleeping
Sleeping
File size: 2,058 Bytes
6c655a3 be3a5c4 d98138c 1ce8b88 3e87e76 6c655a3 5c271a3 583f6dd be3a5c4 ef9fa4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import os
import numpy as np
from src.genai.utils.models_loader import embedding_model
import numpy as np
import faiss
import tiktoken
from src.genai.utils.data_loader import caption_index , caption_df
from src.genai.utils.utils import clean_text
class Retrieval:
    """Find influencer posts whose captions are close to a video topic.

    The topic is embedded once at construction time. ``influencers_data``
    then searches the module-level ``caption_index`` with that embedding and
    renders the matching ``caption_df`` rows as a single text blob.
    """

    def __init__(self, video_topic):
        """Embed ``video_topic`` and keep the L2-normalised query vector.

        Args:
            video_topic: Topic to search for. It is passed through ``str()``
                before being handed to ``embedding_model.embed_query``.
        """
        self.video_topic = video_topic
        # np.array(..., dtype=...) always copies, so the in-place
        # normalisation below never touches the embedding model's own output.
        self.query_embedding = np.array(
            embedding_model.embed_query(str(self.video_topic)),
            dtype='float32',
        ).reshape(1, -1)
        faiss.normalize_L2(self.query_embedding)

    def influencers_data(self, *, similarity_threshold=0.35, max_tokens=1000):
        """Return a token-limited text summary of the matching influencers.

        Every row of ``caption_df`` is requested from ``caption_index``; hits
        whose score is at least ``similarity_threshold`` are kept in the order
        the index returned them and rendered as rank / username / engagement /
        branding / product lines.

        Args:
            similarity_threshold: Minimum score a hit must reach to be kept.
                NOTE(review): the query is L2-normalised, so this presumably
                is a cosine / inner-product score -- confirm against how
                ``caption_index`` was built.
            max_tokens: Maximum number of ``gpt-4o-mini`` tokens in the
                returned text; anything beyond that is cut off.

        Returns:
            The cleaned, truncated summary string, or the literal
            ``"No influencers found."`` when nothing passes the threshold
            (including when ``caption_df`` is empty).
        """
        row_count = len(caption_df)
        if row_count == 0:
            # FAISS rejects k == 0, and there is nothing to return anyway.
            return "No influencers found."

        distances, indices = caption_index.search(self.query_embedding, row_count)

        # FAISS pads missing results with label -1, and the index may be out
        # of sync with caption_df. Without the bounds check, -1 would silently
        # select the last row via iloc and a too-large label would raise.
        selected = [
            idx
            for idx, sim in zip(indices[0], distances[0])
            if 0 <= idx < row_count and sim >= similarity_threshold
        ]
        if not selected:
            return "No influencers found."

        outer_list = []
        for rank, idx in enumerate(selected, 1):
            row = caption_df.iloc[idx]
            outer_list.append([
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ])

        # The nested list is stringified as-is before cleaning; clean_text is
        # relied on to turn that repr into readable text.
        cleaned_response = clean_text(str(outer_list))

        # Truncate by model tokens rather than characters so the result fits
        # a fixed prompt budget.
        encoding = tiktoken.encoding_for_model('gpt-4o-mini')
        tokens = encoding.encode(cleaned_response)
        return encoding.decode(tokens[:max_tokens])
|