Spaces:

DvorakInnovationAI
/

GenAI-FASTAPI

Sleeping

App Files Files Community

GenAI-FASTAPI / src /genai /analytics_chatbot /utils /tools.py

subashpoudel

next commit

0b2c9fd 3 months ago

raw

history blame contribute delete

2.7 kB

	import re
	import numpy as np
	import faiss
	from src.genai.utils.models_loader import embedding_model , encoding_model
	from src.genai.utils.utils import clean_text
	from src.genai.utils.data_loader import api_knowledge_df, api_index, caption_df , caption_index
	from src.genai.utils.models_loader import embedding_model
	import pandas as pd

	class APIKnowledgeRetrieveTool:
	    """Look up the API-knowledge entry closest to a query.

	    ``df`` is bound to the module-level ``api_knowledge_df`` and ``index`` to
	    the module-level ``api_index``.
	    """

	    def __init__(self):
	        self.index = api_index
	        self.df = api_knowledge_df

	    def retrieve(self, query):
	        """Return endpoint details for the nearest neighbour of ``query``.

	        The query is embedded with ``embedding_model.embed_query``, reshaped
	        into a single float32 row and searched against ``self.index`` for one
	        neighbour.

	        Args:
	            query: Value handed unchanged to ``embedding_model.embed_query``.

	        Returns:
	            dict with the keys ``'endpoint'``, ``'method'`` and
	            ``'parameters'`` read from the selected part of ``self.df``.
	        """
	        embedding = embedding_model.embed_query(query)
	        query_vector = np.array(embedding).reshape(1, -1).astype('float32')
	        _, neighbour_ids = self.index.search(query_vector, 1)

	        # NOTE(review): the whole ``neighbour_ids[0]`` is given to ``iloc``
	        # (not ``neighbour_ids[0][0]``). If that is an array, the values
	        # below are presumably one-element Series rather than scalars --
	        # confirm that callers expect this shape.
	        match = self.df.iloc[neighbour_ids[0]]
	        return {field: match[field] for field in ('endpoint', 'method', 'parameters')}

	class RetrieverBackup:
	    """Retrieve video rows similar to a query, optionally limited by username.

	    ``df`` is bound to the module-level ``caption_df`` and ``index`` to the
	    module-level ``caption_index``. The code reads the columns ``username``,
	    ``videoUrl``, ``likesCount`` and ``commentCount`` from ``df``.
	    """

	    def __init__(self):
	        self.df = caption_df
	        self.index = caption_index

	    def _matched_usernames(self, query):
	        """Return the usernames in ``self.df`` that occur as whole words in ``query``.

	        The query is converted with ``str`` so that non-string queries are
	        handled the same way as in the embedding step of ``retrieve``.
	        Empty and non-string usernames are skipped: an empty name would build
	        the pattern ``\\b\\b``, which matches almost any text.
	        """
	        text = str(query)
	        candidates = self.df["username"].dropna().unique()
	        return [
	            name
	            for name in candidates
	            if isinstance(name, str)
	            and name
	            and re.search(rf"\b{re.escape(name)}\b", text)
	        ]

	    @staticmethod
	    def _int_or_none(value):
	        """Convert ``value`` to ``int``, or return ``None`` when it is null."""
	        return int(value) if pd.notnull(value) else None

	    def _filter_dataset(self, query):
	        """Return the rows of ``self.df`` whose username is mentioned in ``query``.

	        When no username from the frame appears in the query, the whole frame
	        is returned unchanged.
	        """
	        matched_users = self._matched_usernames(query)
	        if matched_users:
	            return self.df[self.df["username"].isin(matched_users)]
	        return self.df

	    def retrieve(self, query, similarity_threshold=0.1, max_results=10):
	        """Return up to ``max_results`` rows that score well against ``query``.

	        Args:
	            query: Search text; converted with ``str`` before use.
	            similarity_threshold: Hits scoring below this value are dropped.
	            max_results: Maximum number of entries to return.

	        Returns:
	            list of dicts with the keys ``'url'``, ``'username'``,
	            ``'likesCount'`` and ``'commentCount'``, in the order the index
	            returned them. Counts are ``int`` or ``None`` when null.
	        """
	        # Nothing can be returned for an empty frame or a non-positive cap;
	        # bail out before asking the index for zero neighbours.
	        if max_results <= 0 or len(self.df) == 0:
	            return []

	        query_text = str(query)
	        query_embedding = np.array(embedding_model.embed_query(query_text)).reshape(1, -1).astype('float32')
	        faiss.normalize_L2(query_embedding)

	        # Search on full dataset (index is built on full df).
	        # NOTE(review): the scores are treated as similarities (higher is
	        # better); presumably the index is inner-product over normalised
	        # vectors -- confirm against the index construction.
	        distances, indices = self.index.search(query_embedding, len(self.df))

	        # If the query mentions usernames, only rows of those users are kept.
	        matched_users = set(self._matched_usernames(query_text))

	        results = []
	        for idx, sim in zip(indices[0], distances[0]):
	            # FAISS reports label -1 when it finds fewer neighbours than
	            # requested; iloc[-1] would silently return the last row.
	            if idx < 0:
	                continue
	            if sim < similarity_threshold:
	                continue

	            row = self.df.iloc[idx]
	            if matched_users and row["username"] not in matched_users:
	                continue

	            results.append({
	                'url': row['videoUrl'],
	                'username': row['username'],
	                'likesCount': self._int_or_none(row['likesCount']),
	                'commentCount': self._int_or_none(row['commentCount']),
	            })
	            # Stop scanning once the cap is reached instead of truncating later.
	            if len(results) >= max_results:
	                break
	        return results