Spaces:

subashdvorak
/

trygithubactions

Sleeping

App Files Files Community

subashpoudel committed on Jul 8, 2025

Commit

b4fb6ac

1 Parent(s): 6874dac

Refined embedding loader

Browse files

Files changed (17) hide show

__pycache__/main.cpython-312.pyc +0 -0
brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc +0 -0
brainstroming_agent/utils/tools.py +4 -34
ideation_agent/utils/__pycache__/tools.cpython-312.pyc +0 -0
ideation_agent/utils/tools.py +1 -13
main.py +1 -0
orchestration_agent/agent.py +19 -2
orchestration_agent/utils/nodes.py +2 -0
orchestration_agent/utils/prompts.py +4 -3
orchestration_agent/utils/tools.py +54 -14
orchestration_agent/utils/utils.py +3 -9
requirements.txt +3 -0
utils/__pycache__/data_loader.cpython-312.pyc +0 -0
utils/__pycache__/models_loader.cpython-312.pyc +0 -0
utils/data_loader.py +1 -2
utils/load_embeddings.py +18 -0
utils/models_loader.py +6 -30

__pycache__/main.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/main.cpython-312.pyc and b/__pycache__/main.cpython-312.pyc differ

brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc CHANGED Viewed

Binary files a/brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc and b/brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc differ

brainstroming_agent/utils/tools.py CHANGED Viewed

@@ -14,6 +14,7 @@ import faiss
 import ast
 import pandas as pd
 from .state import QueryFormatter
 os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
 # @tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
@@ -22,28 +23,12 @@ def retrieve_tool(video_topic):
     Always invoke this tool.
     Retrieve influencer's data by semantic search of **video topic**.
     '''
-    # === Load CSV ===
-    csv_path = 'extracted_data.csv'
-    df = pd.read_csv(csv_path)
-    # === Parse stored embeddings ===
-    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
-    embeddings = np.vstack(df['embeddings'].values).astype('float32')
-    # === Build FAISS index ===
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    # === Load SentenceTransformer model ===
-    # === Encode the query and search ===
     query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
     top_k=10
     distances, indices = index.search(query_embedding, top_k)
     # === Format results ===
     outer_list = []
     for i, idx in enumerate(indices[0]):
@@ -70,22 +55,7 @@ def retrieve_manual(video_topic):
     Always invoke this tool.
     Retrieve influencer's data by semantic search of **video topic**.
     '''
-    # === Load CSV ===
-    csv_path = 'extracted_data.csv'
-    df = pd.read_csv(csv_path)
-    # === Parse stored embeddings ===
-    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
-    embeddings = np.vstack(df['embeddings'].values).astype('float32')
-    # === Build FAISS index ===
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    # === Load SentenceTransformer model ===
-    # === Encode the query and search ===
     query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
     top_k=5
     distances, indices = index.search(query_embedding, top_k)

 import ast
 import pandas as pd
 from .state import QueryFormatter
+from utils.load_embeddings import  embeddings , index
 os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
 # @tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
     Always invoke this tool.
     Retrieve influencer's data by semantic search of **video topic**.
     '''
+    df = pd.read_csv('extracted_data.csv')
     query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
     top_k=10
     distances, indices = index.search(query_embedding, top_k)
     # === Format results ===
     outer_list = []
     for i, idx in enumerate(indices[0]):
     Always invoke this tool.
     Retrieve influencer's data by semantic search of **video topic**.
     '''
+    df = pd.read_csv('extracted_data.csv')
     query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
     top_k=5
     distances, indices = index.search(query_embedding, top_k)

ideation_agent/utils/__pycache__/tools.cpython-312.pyc CHANGED Viewed

Binary files a/ideation_agent/utils/__pycache__/tools.cpython-312.pyc and b/ideation_agent/utils/__pycache__/tools.cpython-312.pyc differ

ideation_agent/utils/tools.py CHANGED Viewed

@@ -6,6 +6,7 @@ import numpy as np
 import ast
 import faiss
 from utils.models_loader import ST
 @tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
 def retrieve_tool(business_details):
@@ -17,24 +18,11 @@ def retrieve_tool(business_details):
     csv_path = 'extracted_data.csv'
     df = pd.read_csv(csv_path)
-    # === Parse stored embeddings ===
-    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
-    embeddings = np.vstack(df['embeddings'].values).astype('float32')
-    # === Build FAISS index ===
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    # === Load SentenceTransformer model ===
-    # === Encode the query and search ===
     query_embedding = ST.encode(str(business_details)).reshape(1, -1).astype('float32')
     top_k=7
     distances, indices = index.search(query_embedding, top_k)
     # === Format results ===
     outer_list = []
     for i, idx in enumerate(indices[0]):

 import ast
 import faiss
 from utils.models_loader import ST
+from utils.load_embeddings import embeddings , index
 @tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
 def retrieve_tool(business_details):
     csv_path = 'extracted_data.csv'
     df = pd.read_csv(csv_path)
     query_embedding = ST.encode(str(business_details)).reshape(1, -1).astype('float32')
     top_k=7
     distances, indices = index.search(query_embedding, top_k)
     # === Format results ===
     outer_list = []
     for i, idx in enumerate(indices[0]):

main.py CHANGED Viewed

@@ -163,6 +163,7 @@ def generate_final_story_endpoint():
     return {
         'response': final_story
     }
 # stored_data['final_story']= '''A cinematic journey follows a street magician\'s
 # metamorphosis from a mere trickster to a powerful performer, as he transforms his act with newfound physical strength, effortlessly executing death-defying stunts, and inspiring a captivated crowd to take action, all set against a
 # backdrop of urban grandeur and pulsing energy.'''

     return {
         'response': final_story
     }
 # stored_data['final_story']= '''A cinematic journey follows a street magician\'s
 # metamorphosis from a mere trickster to a powerful performer, as he transforms his act with newfound physical strength, effortlessly executing death-defying stunts, and inspiring a captivated crowd to take action, all set against a
 # backdrop of urban grandeur and pulsing energy.'''

orchestration_agent/agent.py CHANGED Viewed

@@ -4,6 +4,7 @@ from .utils.nodes import tool_return_node, extract_user_reference_node
 from utils.models_loader import llm_gpt
 from .utils.state import ValidationFormatter
 from .utils.utils import caption_image , extract_latest_response_block
 import re
 from langchain_core.messages import SystemMessage
@@ -20,17 +21,33 @@ def orchestration_graph():
     workflow.add_edge('chatbot2', END)
     return workflow.compile(checkpointer=memory)
 def orchestration_chat(user_input: str, image_base64=[]):
     if len(image_base64)>0:
         caption_response = caption_image(image_base64, user_input)
         print('Caption Response:', caption_response)
     else:
         caption_response =''
     agent = orchestration_graph()
     config = {"configurable": {"thread_id": "orchestration-thread"}}
-    response = agent.invoke({"messages": [{'role':'human','content':user_input},{'role':'function','name':'information_of_image','content':caption_response}]}, config)['messages']
-    # template = [SystemMessage(content=final_validator_prompt)] + response
     response=llm_gpt.with_structured_output(ValidationFormatter).invoke(extract_latest_response_block(response))
     return response

 from utils.models_loader import llm_gpt
 from .utils.state import ValidationFormatter
 from .utils.utils import caption_image , extract_latest_response_block
+from .utils.tools import retrieve_data_for_orchestration
 import re
 from langchain_core.messages import SystemMessage
     workflow.add_edge('chatbot2', END)
     return workflow.compile(checkpointer=memory)
+user_input_history = []
 def orchestration_chat(user_input: str, image_base64=[]):
+    global user_input_history
+    user_input_history.append({'role': 'human', 'content': user_input})
     if len(image_base64)>0:
         caption_response = caption_image(image_base64, user_input)
+        user_input_history.append({'role': 'image_caption', 'content': caption_response})
         print('Caption Response:', caption_response)
     else:
         caption_response =''
+    if len(user_input_history)>4:
+        user_input_history=user_input_history[-2:]
+    print('Length of history', len(user_input_history))
+    query_for_retrieval = ' '.join(
+    [msg['content'] for msg in user_input_history if msg['role'] in ('human', 'image_caption')]
+    )
+    influencers_data = retrieve_data_for_orchestration(query_for_retrieval)
     agent = orchestration_graph()
     config = {"configurable": {"thread_id": "orchestration-thread"}}
+    response = agent.invoke({"messages": [{'role':'human','content':user_input},
+                                          {'role': 'function', 'name': 'data_of_influencers', 'content': influencers_data},
+                                          {'role':'function','name':'information_of_image','content':caption_response}]}, config)['messages']
+    print('Orchestrator Response', response)
     response=llm_gpt.with_structured_output(ValidationFormatter).invoke(extract_latest_response_block(response))
     return response

orchestration_agent/utils/nodes.py CHANGED Viewed

@@ -6,6 +6,8 @@ from .state import  ToolResponseFormatter, UserReferenceResponseFormatter
 def tool_return_node(state):
     history = state["messages"]
     template = [SystemMessage(content=tool_return_prompt)] + history
     # print(template)

 def tool_return_node(state):
+    if len(state["messages"]) > 23:
+        state["messages"] = state["messages"][-18:]
     history = state["messages"]
     template = [SystemMessage(content=tool_return_prompt)] + history
     # print(template)

orchestration_agent/utils/prompts.py CHANGED Viewed

@@ -34,7 +34,7 @@ Your job is to:
 1. **Read the user's message carefully** and identify their intent.
 2. **Return exactly two things**:
    - `tools`: a Python-style list of tool names (from the ordered list below) that match the user’s intent.
-   - `query_response`: a short, friendly, and helpful reply that aligns with the tools you've selected.
 Your `tools` output must:
 - Always be a **Python-style list**, even if only one tool is selected.
@@ -46,6 +46,7 @@ Your `query_response` must:
 - Be aligned with the selected tools.
 - Never say something is impossible if a tool exists for it.
 ---
 ### Available Tools (in execution order — use this order in your response):
@@ -99,14 +100,14 @@ Your `query_response` must:
 ### Influencer Assistant Note:
-You're also an intelligent assistant for brands looking to collaborate with influencers. If the user asks about influencer suggestions, trends, or insights (even without tool triggers), give friendly, useful replies using the influencer data provided to you.
 ---
 ### Output Format (always this exact format):
 "tools": ["tool_1", "tool_2"],  // or [] if nothing applies
-"query_response": "Short, clear, and friendly message to the user"
 """

 1. **Read the user's message carefully** and identify their intent.
 2. **Return exactly two things**:
    - `tools`: a Python-style list of tool names (from the ordered list below) that match the user’s intent.
+   - `query_response`: a short, friendly, and helpful reply that aligns with the tools you've selected. Also return the relevant data of influencers aligning with the uer query only if the data of influencers.
 Your `tools` output must:
 - Always be a **Python-style list**, even if only one tool is selected.
 - Be aligned with the selected tools.
 - Never say something is impossible if a tool exists for it.
 ---
 ### Available Tools (in execution order — use this order in your response):
 ### Influencer Assistant Note:
+You're also an intelligent assistant for brands looking to collaborate with influencers. If the user asks about influencer suggestions, trends, or insights , give detailed, useful replies using the influencer data provided to you.
 ---
 ### Output Format (always this exact format):
 "tools": ["tool_1", "tool_2"],  // or [] if nothing applies
+"query_response": "Short, clear, and friendly message to the user. Give the data of influencers too if provided to you."
 """

orchestration_agent/utils/tools.py CHANGED Viewed

@@ -4,25 +4,14 @@ import pandas as pd
 import numpy as np
 from utils.models_loader import ST
 import json
-def retrieve_tool(business_details):
     '''
     Always invoke this tool.
     Retrieve influencer's data by semantic search of **business details**.
     '''
-    # === Load CSV ===
-    csv_path = 'extracted_data.csv'
-    df = pd.read_csv(csv_path)
-    # === Parse stored embeddings ===
-    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
-    embeddings = np.vstack(df['embeddings'].values).astype('float32')
-    # === Build FAISS index ===
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
     # === Encode the query and search ===
     query_embedding = ST.encode(str(business_details)).reshape(1, -1).astype('float32')
@@ -42,4 +31,55 @@ def retrieve_tool(business_details):
     return results

 import numpy as np
 from utils.models_loader import ST
 import json
+from utils.load_embeddings import df, embeddings , index
+def retrieve_data_for_analytics(business_details):
     '''
     Always invoke this tool.
     Retrieve influencer's data by semantic search of **business details**.
     '''
+    df = pd.read_csv('extracted_data.csv')
     # === Encode the query and search ===
     query_embedding = ST.encode(str(business_details)).reshape(1, -1).astype('float32')
     return results
+def retrieve_data_for_orchestration(query):
+    df = pd.read_csv('extracted_data.csv')
+    # === Encode and normalize query ===
+    query_embedding = ST.encode(str(query)).reshape(1, -1).astype('float32')
+    faiss.normalize_L2(query_embedding)
+    # === Search with high top_k to filter later ===
+    top_k = len(df)
+    distances, indices = index.search(query_embedding, top_k)
+    # === Filter by similarity threshold (0.60) ===
+    similarity_threshold = 0.60
+    selected = [(idx, sim) for idx, sim in zip(indices[0], distances[0]) if sim >= similarity_threshold]
+    if not selected:
+        return "❌ No influencers found."
+    # === Format results ===
+    outer_list = []
+    for rank, (idx, sim) in enumerate(selected, 1):
+        row = df.iloc[idx]
+        res = {
+            'rank': rank,
+            'username': row['username'],
+            # 'story': row['story'],
+            'visible_text_or_brandings': row['visible_texts_or_brandings'],
+            'likesCount': row['likesCount'],
+            'commentCount': row['commentCount'],
+            'product_or_service_details': row['product_or_service_details'],
+        }
+        inner_list = [
+            f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**",
+            # f"The story of that particular video is:\n{res['story']}",
+            f"The branding or promotion done is:\n{res['visible_text_or_brandings']}",
+            f"The details of product or service is:\n{res['product_or_service_details']}"
+        ]
+        outer_list.append(inner_list)
+        # === Flatten and tokenize all lines ===
+    flat_lines = [line for sublist in outer_list for line in sublist]
+    tokens = ' '.join(flat_lines).split()
+        # === If >1000 tokens, trim from back, keeping top ranks ===
+    if len(tokens) > 1000:
+        tokens = tokens[:1000]
+        trimmed_response = ' '.join(tokens)
+        return trimmed_response
+    else:
+        return '\n\n'.join(flat_lines)

orchestration_agent/utils/utils.py CHANGED Viewed

@@ -6,8 +6,7 @@ import os
 from .prompts import captioning_prompt
 from utils.models_loader import llm
 from langchain_core.messages import FunctionMessage , AIMessage
-from .tools import retrieve_tool
-from.prompts import show_analytics_prompt
 def caption_image(image_base64,user_input):
@@ -41,12 +40,7 @@ def caption_image(image_base64,user_input):
 def show_analytics(business_details):
-    tool_response = retrieve_tool(str(business_details))
-    # template = [SystemMessage(content=show_analytics_prompt()),
-    #             HumanMessage(content=str(business_details)),
-    #             ToolMessage(content=tool_response, tool_call_id='analytics_id')]
-    # response = llm.invoke(template).content
     return tool_response
 def extract_latest_response_block(response):
@@ -65,5 +59,5 @@ def extract_latest_response_block(response):
                     break
                 else:
                     temp_block = []
     return latest_block

 from .prompts import captioning_prompt
 from utils.models_loader import llm
 from langchain_core.messages import FunctionMessage , AIMessage
+from .tools import  retrieve_data_for_analytics
 def caption_image(image_base64,user_input):
 def show_analytics(business_details):
+    tool_response = retrieve_data_for_analytics(str(business_details))
     return tool_response
 def extract_latest_response_block(response):
                     break
                 else:
                     temp_block = []
+    print('The latest block', latest_block)
     return latest_block

requirements.txt CHANGED Viewed

@@ -15,4 +15,7 @@ langmem
 streamlit
 requests
 langchain_openai

 streamlit
 requests
 langchain_openai
+nltk
+scikit-learn
+pandas

utils/__pycache__/data_loader.cpython-312.pyc CHANGED Viewed

Binary files a/utils/__pycache__/data_loader.cpython-312.pyc and b/utils/__pycache__/data_loader.cpython-312.pyc differ

utils/__pycache__/models_loader.cpython-312.pyc CHANGED Viewed

Binary files a/utils/__pycache__/models_loader.cpython-312.pyc and b/utils/__pycache__/models_loader.cpython-312.pyc differ

utils/data_loader.py CHANGED Viewed

@@ -1,9 +1,8 @@
 from datasets import load_dataset
 dataset = load_dataset("subashdvorak/tiktok-formatted-story-v2", revision="embedded")
 data = dataset['train'].add_faiss_index('embeddings')
 def load_influencer_data():
     return data

 from datasets import load_dataset
 dataset = load_dataset("subashdvorak/tiktok-formatted-story-v2", revision="embedded")
 data = dataset['train'].add_faiss_index('embeddings')
 def load_influencer_data():
     return data

utils/load_embeddings.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import numpy as np
+import ast
+import faiss
+import pandas as pd
+def load_index_once():
+    df = pd.read_csv('extracted_data.csv')
+    df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
+    embeddings = np.vstack(df['embeddings'].values).astype('float32')
+    faiss.normalize_L2(embeddings)
+    index = faiss.IndexFlatIP(embeddings.shape[1])
+    index.add(embeddings)
+    return df, embeddings, index
+print('Loading Embeddings...........')
+# Load once on script start
+df, embeddings, index = load_index_once()

utils/models_loader.py CHANGED Viewed

@@ -51,28 +51,14 @@ class HFEmbeddingAPI:
         return np.array(embeddings[0]) if len(embeddings) == 1 else np.array(embeddings)
 # Instantiate your API-backed "SentenceTransformer"
-# ST = HFEmbeddingAPI(
-#     api_url="https://router.huggingface.co/hf-inference/models/mixedbread-ai/mxbai-embed-large-v1/pipeline/feature-extraction",
-#     token=os.environ.get('HUGGINGFACEHUB_ACCESS_TOKEN')
-# )
-ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
-# Initiated the models for ideation
-# ideator_llm = ChatGroq(
-#     model="llama-3.1-8b-instant",
-#     temperature=0.7,
-#     max_tokens=500,
-# )
-# critic_llm = ChatGroq(
-#     model="llama-3.3-70b-versatile",
-#     temperature=0.7,
-#     max_tokens=500,
-# )
 improver_llm = ChatOpenAI(
     model="gpt-4o-mini",
@@ -80,21 +66,11 @@ improver_llm = ChatOpenAI(
     max_tokens=500,
 )
-# improver_llm = ChatGroq(
-#     model="llama3-8b-8192",
-#     temperature=0.7,
-#     max_tokens=500,
-# )
 ideator_llm = llm
 critic_llm = llm
 validator_llm = llm
-# validator_llm = ChatGroq(
-#     model="llama-3.3-70b-versatile",
-#     temperature=0.7,
-#     max_tokens=500,
-# )

         return np.array(embeddings[0]) if len(embeddings) == 1 else np.array(embeddings)
 # Instantiate your API-backed "SentenceTransformer"
+ST = HFEmbeddingAPI(
+    api_url="https://router.huggingface.co/hf-inference/models/mixedbread-ai/mxbai-embed-large-v1/pipeline/feature-extraction",
+    token=os.environ.get('HUGGINGFACEHUB_ACCESS_TOKEN')
+)
+# ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
 improver_llm = ChatOpenAI(
     model="gpt-4o-mini",
     max_tokens=500,
 )
 ideator_llm = llm
 critic_llm = llm
 validator_llm = llm