Spaces:
Sleeping
Sleeping
File size: 2,700 Bytes
93d50e5 a6a0614 93d50e5 a6a0614 93d50e5 a6a0614 93d50e5 0b2c9fd 93d50e5 a6a0614 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import re
import numpy as np
import faiss
from src.genai.utils.models_loader import embedding_model , encoding_model
from src.genai.utils.utils import clean_text
from src.genai.utils.data_loader import api_knowledge_df, api_index, caption_df , caption_index
from src.genai.utils.models_loader import embedding_model
import pandas as pd
class APIKnowledgeRetrieveTool:
    """Find the API knowledge entry that best matches a free-text query.

    Holds references to the module-level ``api_knowledge_df`` table and the
    ``api_index`` vector index (presumably a FAISS index -- it is queried
    through ``.search(vectors, k)``).
    """

    def __init__(self):
        self.df = api_knowledge_df
        self.index = api_index

    def retrieve(self, query):
        """Return the endpoint description nearest to *query*.

        Args:
            query: Text passed to ``embedding_model.embed_query``.

        Returns:
            A dict with the keys ``'endpoint'``, ``'method'`` and
            ``'parameters'`` taken from the closest entry in the table.
        """
        # The index is searched with a single float32 row vector.
        vector = np.array(embedding_model.embed_query(query))
        vector = vector.reshape(1, -1).astype('float32')

        # Only the single nearest neighbour is requested; scores are unused.
        _, neighbours = self.index.search(vector, 1)

        # NOTE(review): ``neighbours[0]`` looks like a length-1 array, so
        # ``iloc`` presumably selects a one-row frame and each value below is
        # a one-element Series rather than a scalar -- confirm callers expect
        # that before changing it.
        match = self.df.iloc[neighbours[0]]
        return {
            field: match[field]
            for field in ('endpoint', 'method', 'parameters')
        }
class RetrieverBackup:
    """Retrieve caption rows similar to a query, optionally scoped by user.

    Holds references to the module-level ``caption_df`` table and the
    ``caption_index`` vector index.

    NOTE(review): row positions in ``caption_df`` are assumed to line up with
    the ids stored in ``caption_index`` -- confirm against the data loader.
    """

    # Hits scoring below this value are discarded.
    # NOTE(review): treating larger scores as better assumes an
    # inner-product / cosine index -- confirm how ``caption_index`` is built.
    SIMILARITY_THRESHOLD = 0.1
    # Maximum number of rows returned by ``retrieve``.
    MAX_RESULTS = 10

    def __init__(self):
        self.df = caption_df
        self.index = caption_index

    def _matched_usernames(self, query):
        """Return the known usernames that occur as whole words in *query*.

        The query is coerced to ``str`` so that the regex search accepts the
        same inputs as the embedding call in ``retrieve``.
        """
        text = str(query)
        usernames = self.df["username"].dropna().unique()
        return [
            u for u in usernames
            if re.search(rf"\b{re.escape(u)}\b", text)
        ]

    def _filter_dataset(self, query):
        """Return the rows whose username is mentioned in *query*.

        Args:
            query: Free text that may mention one or more usernames.

        Returns:
            The subset of ``self.df`` belonging to the mentioned usernames,
            or ``self.df`` itself when no known username is mentioned.
        """
        matched_users = self._matched_usernames(query)
        if not matched_users:
            return self.df
        return self.df[self.df["username"].isin(matched_users)]

    def retrieve(self, query):
        """Return up to ``MAX_RESULTS`` rows most similar to *query*.

        Args:
            query: Free text; converted with ``str`` before embedding and
                before username matching.

        Returns:
            A list of dicts with the keys ``'url'``, ``'username'``,
            ``'likesCount'`` and ``'commentCount'`` in the order produced by
            the index search. Counts are ``int`` or ``None`` when missing.
            When the query mentions known usernames, only their rows are kept.
        """
        total_rows = len(self.df)
        if total_rows == 0:
            # Nothing to rank; also avoids asking the index for k=0 hits.
            return []

        text = str(query)
        query_embedding = (
            np.array(embedding_model.embed_query(text))
            .reshape(1, -1)
            .astype('float32')
        )
        faiss.normalize_L2(query_embedding)

        # Search the full dataset (the index is built on the full table);
        # the username restriction is applied to the ranked hits afterwards.
        distances, indices = self.index.search(query_embedding, total_rows)

        matched_users = set(self._matched_usernames(text))

        results = []
        for idx, sim in zip(indices[0], distances[0]):
            # A negative id means "no result" for this slot; without this
            # check ``iloc`` would silently pick a row from the end.
            if idx < 0 or sim < self.SIMILARITY_THRESHOLD:
                continue
            row = self.df.iloc[idx]
            # If the query mentions usernames, only keep those rows.
            if matched_users and row["username"] not in matched_users:
                continue
            likes = row['likesCount']
            comments = row['commentCount']
            results.append({
                'url': row['videoUrl'],
                'username': row['username'],
                'likesCount': int(likes) if pd.notnull(likes) else None,
                'commentCount': int(comments) if pd.notnull(comments) else None,
            })
            if len(results) == self.MAX_RESULTS:
                # Enough hits collected; later ones would be discarded anyway.
                break
        return results
|