Spaces:

subashdvorak
/

trygithubactions

Sleeping

App Files Files Community

subashpoudel committed on May 19, 2025

Commit

3e87e76

1 Parent(s): 53ffc0f

Enhanced retrieval tool

Browse files

Files changed (1) hide show

my_agent/utils/tools.py +58 -30

my_agent/utils/tools.py CHANGED Viewed

@@ -11,46 +11,74 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from langchain_core.messages import SystemMessage
 import re
 os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
-class StoryFormatter(BaseModel):
-    """Always use this tool to structure your response to the user."""
-    story: str=Field(description="How to introduce the scene and set the tone. What is happening in the scene? Describe key visuals and actions")
-    narration:str=Field(description="Suggestions for narration or voiceover that complements the visuals." )
-    text_in_the_Video:str=Field(description="Propose important text overlays for key moments.")
-    transitions:str=Field(description="Smooth transitions between scenes to maintain flow.")
-    emotional_tone:str=Field(description="The mood and energy of the scenes (e.g., excitement, calm, tension, joy")
-    key_visuals:str=Field(description="Important props, locations, sound effects, or background music to enhance the video.")
-class BrainstromTopicFormatter(BaseModel):
-  topic1:str=Field(description="First brainstorming topic of the story")
-  topic2:str=Field(description="Second brainstorming topic of the story")
-  topic3:str=Field(description="Third brainstorming topic of the story")
-  topic4:str=Field(description="Fourth brainstorming topic of the story")
-class QueryFormatter(BaseModel):
-    messages:str = Field(description="The user query")
-    business_details: str = Field(description="The details of the business of that user.")
-@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
-def retrieve_tool(messages, business_details):
-    '''Always invoke this tool once.'''
-    print('The query for retrieval is:',messages)
-    embedded_query = ST.encode(str(messages)+str(business_details))  # Embed each topic
-    data = load_influencer_data()
-    scores, retrieved_examples = data.get_nearest_examples("embeddings", embedded_query, k=2)
-    # Construct a list of dictionaries for this topic
-    result = [{user: story} for user, story in zip(retrieved_examples['username'], retrieved_examples['agentic_story'])]
-    print('Tool response:',result)
-    return result

 import numpy as np
 from langchain_core.messages import SystemMessage
 import re
+import faiss
+import ast
+import pandas as pd
+from .validators import QueryFormatter
 os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
+@tool("influencer's data-retrieval-tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
+def retrieve_tool(messages, business_details):
+    '''
+    Always invoke this tool.
+    Retrieve influencer's data by semantic search of **user messages** and the **business details**.
+    '''
+    # === Load CSV ===
+    csv_path = 'extracted_data.csv'
+    df = pd.read_csv(csv_path)
+    # === Parse stored embeddings ===
+    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
+    embeddings = np.vstack(df['embeddings'].values).astype('float32')
+    # === Build FAISS index ===
+    dimension = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings)
+    # === Load SentenceTransformer model ===
+    # === Encode the query and search ===
+    query_embedding = ST.encode(str(messages)+str(business_details)).reshape(1, -1).astype('float32')
+    top_k=3
+    distances, indices = index.search(query_embedding, top_k)
+    # === Function to extract sections 1 and 6 ===
+    def extract_story_and_branding(full_story):
+        full_story = full_story.replace('**6. Visible Texts or Brandings**', '**6. Visible Texts or Brandings:**')
+        full_story = full_story.replace('**1. Story**', '**1. Story:**')
+        pattern = (
+            r"\*\*1\. Story:\*\*(.*?)(?=\*\*\d+\.\s)"
+            r".*?"
+            r"\*\*6\. Visible Texts or Brandings:\*\*(.*?)(?=\*\*\d+\.\s|$)"
+        )
+        match = re.search(pattern, full_story, re.DOTALL)
+        if match:
+            story_section = match.group(1).strip()
+            branding_section = match.group(2).strip()
+            return f"Story:\n{story_section}\n\nVisible Texts or Brandings:\n{branding_section}"
+        else:
+            return "Requested sections not found."
+    # === Format results ===
+    outer_list = []
+    for i, idx in enumerate(indices[0]):
+        res = {
+            'rank': i + 1,
+            'username': df.iloc[idx]['username'],
+            'agentic_story': df.iloc[idx]['agentic_story'],
+            'likesCount': df.iloc[idx]['likesCount'],
+            'commentCount': df.iloc[idx]['commentCount'],
+            'distance': distances[0][i]
+        }
+        inner_list = []
+        inner_list.append(f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**")
+        inner_list.append(f"The story of that particular video is:\n{extract_story_and_branding(res['agentic_story'])}")
+        inner_list.append(f"Distance: {res['distance']:.4f}")
+        outer_list.append(inner_list)
+    return str(outer_list)