Spaces:
Sleeping
Sleeping
| import re | |
| import numpy as np | |
| import faiss | |
| from src.genai.utils.models_loader import embedding_model , encoding_model | |
| from src.genai.utils.utils import clean_text | |
| from src.genai.utils.data_loader import api_knowledge_df, api_index, caption_df , caption_index | |
| from src.genai.utils.models_loader import embedding_model | |
| import pandas as pd | |
class APIKnowledgeRetrieveTool:
    """Look up the API-knowledge row closest to a free-text query.

    Holds references to the module-level ``api_knowledge_df`` table and the
    ``api_index`` vector index loaded by the data loader.
    """

    def __init__(self):
        self.index = api_index
        self.df = api_knowledge_df

    def retrieve(self, query):
        """Return endpoint, method and parameters of the nearest indexed entry.

        The query is embedded with ``embedding_model.embed_query``, reshaped
        to a single float32 row vector and used to ask the index for exactly
        one neighbour.

        Args:
            query: Text passed unchanged to ``embedding_model.embed_query``.

        Returns:
            dict with the keys ``'endpoint'``, ``'method'`` and
            ``'parameters'``, taken from the selected part of ``self.df``.

        NOTE(review): the selection is made with the whole first row of the
        returned index array (not a single integer), so — assuming the index
        follows the FAISS ``search`` contract — each value is a one-element
        pandas selection rather than a scalar. Confirm what callers expect
        before unwrapping it.
        """
        vector = embedding_model.embed_query(query)
        query_matrix = np.array(vector).reshape(1, -1).astype('float32')

        # Only the neighbour positions are needed; the scores are discarded.
        _, neighbour_ids = self.index.search(query_matrix, 1)
        hit = self.df.iloc[neighbour_ids[0]]

        return {field: hit[field] for field in ('endpoint', 'method', 'parameters')}
class RetrieverBackup:
    """Retrieve caption rows similar to a query, optionally limited by username.

    Holds references to the module-level ``caption_df`` table and the
    ``caption_index`` vector index. Rows are read through the columns
    ``username``, ``videoUrl``, ``likesCount`` and ``commentCount``.
    """

    def __init__(self):
        self.df = caption_df
        self.index = caption_index

    def _match_usernames(self, query):
        """Return the distinct usernames of ``self.df`` found in *query*.

        A username matches when it appears in the query text between regex
        word boundaries (``\\b``). Missing usernames are ignored. Both the
        query and each username are converted with ``str`` before matching so
        that non-string values do not raise ``TypeError``.
        """
        text = str(query)
        usernames = self.df["username"].dropna().unique()
        return [
            u for u in usernames
            if re.search(rf"\b{re.escape(str(u))}\b", text)
        ]

    @staticmethod
    def _optional_int(value):
        """Convert *value* to ``int``, or return ``None`` when it is null."""
        return int(value) if pd.notnull(value) else None

    def _filter_dataset(self, query):
        """Return the rows whose username is mentioned in *query*.

        Args:
            query: Query text (converted with ``str``).

        Returns:
            The subset of ``self.df`` belonging to the mentioned usernames,
            or ``self.df`` itself when no username is mentioned.
        """
        matched_users = self._match_usernames(query)
        if not matched_users:
            return self.df
        return self.df[self.df["username"].isin(matched_users)]

    def retrieve(self, query, top_k=10, similarity_threshold=0.1):
        """Return up to *top_k* caption rows matching *query*.

        The query is embedded, L2-normalised and searched against the whole
        index. Hits are visited in the order the index returns them; a hit is
        kept when its score is at least *similarity_threshold* and, if the
        query mentions any known username, its row belongs to one of them.

        Args:
            query: Query text (converted with ``str``).
            top_k: Maximum number of results to return. Defaults to 10.
            similarity_threshold: Minimum score a hit must have. Defaults
                to 0.1.

        Returns:
            list of dicts with the keys ``'url'``, ``'username'``,
            ``'likesCount'`` and ``'commentCount'``; the two counts are
            ``int`` or ``None`` when the cell is null.

        NOTE(review): scores are treated as similarities (higher is better),
        which presumes an inner-product index over normalised vectors, and
        index positions are assumed to line up with ``self.df`` row order —
        confirm against the index builder.
        """
        n_rows = len(self.df)
        if top_k <= 0 or n_rows == 0:
            # Nothing can be returned; also avoids searching with k == 0.
            return []

        text = str(query)
        query_embedding = (
            np.array(embedding_model.embed_query(text))
            .reshape(1, -1)
            .astype('float32')
        )
        faiss.normalize_L2(query_embedding)

        # Search the full dataset so the username filter can be applied
        # afterwards without missing lower-ranked rows.
        distances, indices = self.index.search(query_embedding, n_rows)

        # Set membership keeps the per-hit username check O(1).
        matched_users = set(self._match_usernames(text))

        results = []
        for idx, sim in zip(indices[0], distances[0]):
            # FAISS pads missing neighbours with -1; iloc[-1] would silently
            # return the last row, so skip anything outside the table.
            if idx < 0 or idx >= n_rows:
                continue
            if sim < similarity_threshold:
                continue

            row = self.df.iloc[idx]
            # If the query mentions usernames, only keep those rows.
            if matched_users and row["username"] not in matched_users:
                continue

            results.append({
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': self._optional_int(row['likesCount']),
                'commentCount': self._optional_int(row['commentCount']),
            })
            if len(results) >= top_k:
                # Later hits would be sliced off anyway; stop early.
                break

        return results