from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
from dotenv import load_dotenv

# Load variables from .env before the imports below run, in case any of them
# reads configuration from the environment at import time.
load_dotenv()

import ast
import functools
import os
import re

# NOTE: the relative order of the third-party / project imports below is kept
# exactly as originally written; some native libraries are sensitive to the
# order in which they are loaded.
import numpy as np
from langchain_core.tools import tool
from utils.data_loader import load_influencer_data
from utils.models_loader import ST, llm
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.messages import SystemMessage
import faiss
import pandas as pd

from .validators import QueryFormatter

# Re-export the key only when it is actually defined: assigning ``None`` to
# ``os.environ`` raises ``TypeError`` and would make this module unimportable.
_groq_api_key = os.getenv('GROQ_API_KEY')
if _groq_api_key is not None:
    os.environ['GROQ_API_KEY'] = _groq_api_key

# CSV holding one row per influencer video; resolved relative to the current
# working directory. The code below reads the columns ``embeddings``,
# ``username``, ``agentic_story``, ``likesCount`` and ``commentCount``.
_CSV_PATH = 'extracted_data.csv'

# Number of nearest neighbours returned per query.
_TOP_K = 3

# Message returned when a story does not contain both requested sections.
_SECTIONS_NOT_FOUND = "Requested sections not found."

# Captures the body of section 1 ("Story") and of section 6 ("Visible Texts
# or Brandings"). Each capture stops at the next "**<number>. " heading (or,
# for section 6, at the end of the text). Compiled once instead of per call.
_SECTION_PATTERN = re.compile(
    r"\*\*1\. Story:\*\*(.*?)(?=\*\*\d+\.\s)"
    r".*?"
    r"\*\*6\. Visible Texts or Brandings:\*\*(.*?)(?=\*\*\d+\.\s|$)",
    re.DOTALL,
)


def _extract_story_and_branding(full_story):
    """Return sections 1 and 6 of an agentic story as display text.

    Args:
        full_story: The markdown-like story text. Non-string values (for
            example NaN from an empty CSV cell) are treated as "not found".

    Returns:
        ``"Story:\\n...\\n\\nVisible Texts or Brandings:\\n..."`` when both
        sections are present, otherwise ``"Requested sections not found."``.
    """
    if not isinstance(full_story, str):
        return _SECTIONS_NOT_FOUND

    # Normalise headings written without the trailing colon so that a single
    # pattern covers both spellings.
    normalized = full_story.replace(
        '**6. Visible Texts or Brandings**', '**6. Visible Texts or Brandings:**'
    )
    normalized = normalized.replace('**1. Story**', '**1. Story:**')

    match = _SECTION_PATTERN.search(normalized)
    if match is None:
        return _SECTIONS_NOT_FOUND

    story_section = match.group(1).strip()
    branding_section = match.group(2).strip()
    return f"Story:\n{story_section}\n\nVisible Texts or Brandings:\n{branding_section}"


@functools.lru_cache(maxsize=1)
def _load_index(csv_path, mtime):
    """Load the influencer table and build an exact L2 FAISS index over it.

    The result is cached so that the CSV is not re-read, re-parsed and
    re-indexed on every tool call.

    Args:
        csv_path: Path of the CSV file to load.
        mtime: Modification time of ``csv_path``. It is not used in the body;
            it is part of the cache key so that an updated file is reloaded.

    Returns:
        A ``(dataframe, index)`` tuple. ``index`` is ``None`` when the CSV
        contains no rows.
    """
    df = pd.read_csv(csv_path)
    if df.empty:
        return df, None

    # Embeddings are stored as the textual form of a Python list;
    # ``literal_eval`` parses them without executing arbitrary code.
    parsed = df['embeddings'].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )
    embeddings = np.vstack(parsed.values).astype('float32')

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return df, index


# NOTE(review): the tool name contains spaces and an apostrophe; some
# chat-model APIs restrict tool names to ``[a-zA-Z0-9_-]`` -- confirm the
# provider in use accepts it. It is left unchanged here because callers may
# refer to the tool by this name.
#
# Parameters:
#   messages          -- the user messages; converted with ``str()``.
#   business_details  -- the business details; converted with ``str()``.
# Returns:
#   ``str()`` of a list with one entry per hit (at most ``_TOP_K``), each
#   entry being ``[headline, story/branding text, "Distance: x.xxxx"]``.
# Raises:
#   FileNotFoundError if the CSV file does not exist.
@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
def retrieve_tool(messages, business_details):
    """
    Always invoke this tool.
    Retrieve influencer's data by semantic search of **user messages** and the **business details**.
    """
    df, index = _load_index(_CSV_PATH, os.path.getmtime(_CSV_PATH))
    if index is None:
        # Nothing to search; same textual shape as an empty result list.
        return str([])

    # Encode the concatenated query text and shape it as a 1 x dim float32
    # matrix, which is what the index search expects.
    query_embedding = ST.encode(str(messages)+str(business_details)).reshape(1, -1).astype('float32')

    # Never request more neighbours than there are vectors: FAISS pads the
    # missing slots with label -1, which ``iloc`` would read as "last row".
    k = min(_TOP_K, index.ntotal)
    distances, indices = index.search(query_embedding, k)

    outer_list = []
    for rank, (row_idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
        if row_idx < 0:
            # Defensive: skip padding labels instead of reporting a bogus row.
            continue
        row = df.iloc[row_idx]
        outer_list.append([
            f"[{rank}]. The influencer name is: **{row['username']}** — Likes: **{row['likesCount']}**, Comments: **{row['commentCount']}**",
            f"The story of that particular video is:\n{_extract_story_and_branding(row['agentic_story'])}",
            f"Distance: {distance:.4f}",
        ])

    return str(outer_list)