Spaces:

DvorakInnovationAI
/

GenAI-FASTAPI

Sleeping

File size: 2,058 Bytes

6c655a3
be3a5c4
 
d98138c
1ce8b88
3e87e76
6c655a3
5c271a3
583f6dd
be3a5c4
ef9fa4b


import os
import numpy as np
from src.genai.utils.models_loader import  embedding_model 
import numpy as np
import faiss
import tiktoken
from src.genai.utils.data_loader import  caption_index , caption_df
from src.genai.utils.utils import clean_text

class Retrieval:
    """Look up influencer caption records that match a video topic.

    The topic is embedded once at construction time with the module-level
    ``embedding_model``; ``influencers_data`` then searches the module-level
    ``caption_index`` and renders the matching rows of ``caption_df`` as a
    token-limited text block.
    """

    # Defaults kept identical to the values that used to be hard-coded.
    DEFAULT_SIMILARITY_THRESHOLD = 0.35
    DEFAULT_MAX_TOKENS = 1000
    NO_RESULTS_MESSAGE = "No influencers found."
    TOKENIZER_MODEL = 'gpt-4o-mini'

    def __init__(self, video_topic):
        """Embed ``video_topic`` and store it as a normalised query vector.

        Args:
            video_topic: Topic to search for. It is converted with ``str()``
                before being embedded, so any printable object is accepted.
        """
        self.video_topic = video_topic
        raw_embedding = embedding_model.embed_query(str(self.video_topic))
        # Single-row float32 matrix: one query vector of whatever dimension
        # the embedding model produces.
        self.query_embedding = np.array(raw_embedding).reshape(1, -1).astype('float32')
        # In-place L2 normalisation of the query.
        # NOTE(review): presumably caption_index is an inner-product index
        # built from normalised vectors, so scores are cosine similarities --
        # confirm against the index construction code.
        faiss.normalize_L2(self.query_embedding)

    def influencers_data(self, similarity_threshold=DEFAULT_SIMILARITY_THRESHOLD,
                         max_tokens=DEFAULT_MAX_TOKENS) -> str:
        """Return a text summary of the influencers matching the topic.

        Every row of ``caption_df`` is a search candidate; rows whose score
        is at least ``similarity_threshold`` are kept, ranked in the order
        the index returns them, formatted, cleaned with ``clean_text`` and
        finally truncated to ``max_tokens`` tokens.

        Args:
            similarity_threshold: Minimum score a hit needs to be included.
                Defaults to 0.35.
            max_tokens: Maximum number of tokens kept in the returned text
                (tokenised with the ``gpt-4o-mini`` encoding). Defaults to
                1000; ``None`` disables truncation.

        Returns:
            The cleaned, token-trimmed summary, or the string
            ``"No influencers found."`` when nothing passes the threshold
            (including when ``caption_df`` is empty).
        """
        top_k = len(caption_df)
        if top_k == 0:
            # Nothing to search; avoid asking the index for zero neighbours.
            return self.NO_RESULTS_MESSAGE

        distances, indices = caption_index.search(self.query_embedding, top_k)

        # FAISS fills unused result slots with label -1. A negative label
        # must be skipped, otherwise ``iloc[-1]`` would silently return the
        # last row of the frame instead of signalling "no result".
        selected = [
            (idx, sim)
            for idx, sim in zip(indices[0], distances[0])
            if idx >= 0 and sim >= similarity_threshold
        ]

        if not selected:
            return self.NO_RESULTS_MESSAGE

        outer_list = []
        for rank, (idx, _sim) in enumerate(selected, 1):
            row = caption_df.iloc[idx]
            outer_list.append([
                f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
                f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
                f"The details of product or service is:\n{row['product_or_service_details']}",
            ])

        # The nested list is stringified as-is (Python list repr) and then
        # passed through the project's clean_text helper.
        cleaned_response = clean_text(str(outer_list))

        # Trim by tokens rather than characters so the result fits a fixed
        # token budget; slicing with None keeps every token.
        encoding = tiktoken.encoding_for_model(self.TOKENIZER_MODEL)
        tokens = encoding.encode(cleaned_response)
        return encoding.decode(tokens[:max_tokens])