Spaces:

subashdvorak
/

trygithubactions

Sleeping

App Files Files Community

trygithubactions / brainstroming_agent /utils /tools.py

subashpoudel

Changed the entire project structure

93a5bf9 8 months ago

raw

history blame

3.27 kB

	from langchain_groq import ChatGroq
	from pydantic import BaseModel, Field
	from dotenv import load_dotenv
	load_dotenv()
	import os
	import numpy as np
	from langchain_core.tools import tool
	from utils.data_loader import load_influencer_data
	from utils.models_loader import ST , llm
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np
	from langchain_core.messages import SystemMessage
	import re
	import faiss
	import ast
	import pandas as pd
	from .validators import QueryFormatter

	# Re-export GROQ_API_KEY into the process environment only when it is
	# actually available. os.environ values must be strings, so assigning the
	# None returned by os.getenv() for an unset variable raises TypeError and
	# makes the whole module fail at import time.
	_groq_api_key = os.getenv('GROQ_API_KEY')
	if _groq_api_key is not None:
	    os.environ['GROQ_API_KEY'] = _groq_api_key

	# Locates section 1 ("Story") and section 6 ("Visible Texts or Brandings") of
	# a numbered story text and captures the body of each one.
	#   * Headings may be wrapped in markdown emphasis ('*') or stray backslashes
	#     and may or may not end with a colon; all of those are consumed by the
	#     character classes so they never leak into the captured text.
	#   * A section body ends at the next numbered heading: a number + '.' +
	#     whitespace that is either preceded by emphasis markers or starts a line.
	#     Section 6 may also simply run to the end of the text (\Z).
	# The previous pattern captured at most one character per section ("(.?)")
	# and required a literal '|' at the end of the text, so it practically never
	# matched.
	_STORY_SECTIONS_RE = re.compile(
	    r"(?<!\d)1\.\s*Story[\\*:]*(.*?)(?=(?:[\\*]+|^[ \t]*)\d+\.\s)"
	    r".*?"
	    r"(?<!\d)6\.\s*Visible Texts or Brandings[\\*:]*(.*?)(?=(?:[\\*]+|^[ \t]*)\d+\.\s|\Z)",
	    re.DOTALL | re.MULTILINE,
	)


	def _extract_story_and_branding(full_story):
	    """Return sections 1 and 6 of a numbered story as a short text block.

	    Args:
	        full_story: The complete story text. Non-string values (for example
	            the NaN pandas produces for an empty CSV cell) are treated as
	            "nothing to extract".

	    Returns:
	        "Story:\\n<section 1>\\n\\nVisible Texts or Brandings:\\n<section 6>"
	        when both sections are found, otherwise the fixed message
	        "Requested sections not found.".
	    """
	    if not isinstance(full_story, str):
	        return "Requested sections not found."

	    match = _STORY_SECTIONS_RE.search(full_story)
	    if match is None:
	        return "Requested sections not found."

	    story_section = match.group(1).strip()
	    branding_section = match.group(2).strip()
	    return f"Story:\n{story_section}\n\nVisible Texts or Brandings:\n{branding_section}"


	# NOTE(review): the tool name contains an apostrophe and a space. Some chat
	# model providers only accept letters, digits, '_' and '-' in tool names --
	# confirm the model in use accepts it. It is left unchanged here because
	# renaming the tool would change what callers and prompts refer to.
	@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
	def retrieve_tool(messages, business_details):
	    """
	    Always invoke this tool.
	    Retrieve influencer's data by semantic search of user messages and the business details.

	    Args:
	        messages: The user messages; converted with str() before encoding.
	        business_details: The business details; converted with str() and
	            appended to the messages to form the search query.

	    Returns:
	        str() of a list with one entry per hit (at most 3, nearest first).
	        Each entry is a list of three strings: a headline with rank,
	        username, likes and comments; the extracted story/branding sections;
	        and the L2 distance formatted with four decimals.
	    """
	    # === Load CSV ===
	    # The path is relative, so it is resolved against the current working
	    # directory of the process.
	    # NOTE(review): the CSV is re-read and the index rebuilt on every call;
	    # consider caching if the file does not change while the app is running.
	    csv_path = 'extracted_data.csv'
	    df = pd.read_csv(csv_path)
	    if df.empty:
	        # Nothing to search; np.vstack would raise on an empty sequence.
	        return str([])

	    # === Parse stored embeddings ===
	    # Embeddings are stored as Python-literal strings in the CSV;
	    # ast.literal_eval parses them without evaluating arbitrary code.
	    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
	    embeddings = np.vstack(df['embeddings'].values).astype('float32')

	    # === Build FAISS index ===
	    dimension = embeddings.shape[1]
	    index = faiss.IndexFlatL2(dimension)
	    index.add(embeddings)

	    # === Encode the query and search ===
	    query_embedding = ST.encode(str(messages)+str(business_details)).reshape(1, -1).astype('float32')
	    # Never ask for more neighbours than there are rows: FAISS pads missing
	    # results with label -1, and df.iloc[-1] would silently return the last
	    # row as if it were a real hit.
	    top_k = min(3, len(df))
	    distances, indices = index.search(query_embedding, top_k)

	    # === Format results ===
	    outer_list = []
	    for rank, (idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
	        if idx < 0:
	            # Padding label from FAISS, not a real row.
	            continue
	        row = df.iloc[idx]
	        outer_list.append([
	            f"[{rank}]. The influencer name is: {row['username']} — Likes: {row['likesCount']}, Comments: {row['commentCount']}",
	            f"The story of that particular video is:\n{_extract_story_and_branding(row['agentic_story'])}",
	            f"Distance: {distance:.4f}",
	        ])

	    return str(outer_list)