Spaces:
Sleeping
Sleeping
Commit
·
6c655a3
1
Parent(s):
fbc17f4
Next commit
Browse files- __pycache__/main.cpython-312.pyc +0 -0
- brainstroming_agent/utils/__pycache__/nodes.cpython-312.pyc +0 -0
- brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc +0 -0
- brainstroming_agent/utils/__pycache__/utils.cpython-312.pyc +0 -0
- brainstroming_agent/utils/nodes.py +1 -7
- brainstroming_agent/utils/tools.py +30 -62
- brainstroming_agent/utils/utils.py +3 -67
- ideation_agent/utils/__pycache__/tools.cpython-312.pyc +0 -0
- ideation_agent/utils/tools.py +31 -23
- orchestration_agent/utils/tools.py +19 -26
- orchestration_agent/utils/utils.py +3 -0
- requirements.txt +3 -0
- utils/__init__.py +0 -0
- utils/__pycache__/data_loader.cpython-312.pyc +0 -0
- utils/__pycache__/models_loader.cpython-312.pyc +0 -0
- utils/data_loader.py +5 -3
- utils/load_embeddings.py +4 -1
- utils/models_loader.py +11 -24
- utils/utils.py +34 -0
__pycache__/main.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/main.cpython-312.pyc and b/__pycache__/main.cpython-312.pyc differ
|
|
|
brainstroming_agent/utils/__pycache__/nodes.cpython-312.pyc
CHANGED
|
Binary files a/brainstroming_agent/utils/__pycache__/nodes.cpython-312.pyc and b/brainstroming_agent/utils/__pycache__/nodes.cpython-312.pyc differ
|
|
|
brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc
CHANGED
|
Binary files a/brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc and b/brainstroming_agent/utils/__pycache__/tools.cpython-312.pyc differ
|
|
|
brainstroming_agent/utils/__pycache__/utils.cpython-312.pyc
CHANGED
|
Binary files a/brainstroming_agent/utils/__pycache__/utils.cpython-312.pyc and b/brainstroming_agent/utils/__pycache__/utils.cpython-312.pyc differ
|
|
|
brainstroming_agent/utils/nodes.py
CHANGED
|
@@ -7,10 +7,8 @@ from utils.models_loader import llm , ST
|
|
| 7 |
from utils.data_loader import load_influencer_data
|
| 8 |
from groq import Groq
|
| 9 |
import os
|
| 10 |
-
from .prompts import image_captioning_prompt , initial_story_prompt , refined_story_prompt , brainstroming_prompt
|
| 11 |
from langgraph.prebuilt import create_react_agent
|
| 12 |
-
from pydantic import BaseModel , Field
|
| 13 |
-
from langchain_core.tools import tool
|
| 14 |
from .state import BrainstromTopicFormatter
|
| 15 |
|
| 16 |
|
|
@@ -75,9 +73,6 @@ def retrieve(state: State) -> State:
|
|
| 75 |
return state
|
| 76 |
|
| 77 |
def generate_story(state:State)-> State:
|
| 78 |
-
retrievals_from_tool = retrieve_tool(state.idea)
|
| 79 |
-
# tools=[retrieve_tool]
|
| 80 |
-
|
| 81 |
react_agent=create_react_agent(
|
| 82 |
model=llm,
|
| 83 |
tools=[]
|
|
@@ -102,7 +97,6 @@ def generate_story(state:State)-> State:
|
|
| 102 |
response = response['messages'][-1].content
|
| 103 |
print('The genrated story: ', response)
|
| 104 |
state.stories.append(response)
|
| 105 |
-
# return State(messages="Story generated", topic=state.topic,stories=state.stories)
|
| 106 |
return state
|
| 107 |
|
| 108 |
|
|
|
|
| 7 |
from utils.data_loader import load_influencer_data
|
| 8 |
from groq import Groq
|
| 9 |
import os
|
| 10 |
+
from .prompts import image_captioning_prompt , initial_story_prompt , refined_story_prompt , brainstroming_prompt
|
| 11 |
from langgraph.prebuilt import create_react_agent
|
|
|
|
|
|
|
| 12 |
from .state import BrainstromTopicFormatter
|
| 13 |
|
| 14 |
|
|
|
|
| 73 |
return state
|
| 74 |
|
| 75 |
def generate_story(state:State)-> State:
|
|
|
|
|
|
|
|
|
|
| 76 |
react_agent=create_react_agent(
|
| 77 |
model=llm,
|
| 78 |
tools=[]
|
|
|
|
| 97 |
response = response['messages'][-1].content
|
| 98 |
print('The genrated story: ', response)
|
| 99 |
state.stories.append(response)
|
|
|
|
| 100 |
return state
|
| 101 |
|
| 102 |
|
brainstroming_agent/utils/tools.py
CHANGED
|
@@ -1,85 +1,53 @@
|
|
| 1 |
-
|
| 2 |
-
from pydantic import BaseModel, Field
|
| 3 |
-
from dotenv import load_dotenv
|
| 4 |
-
load_dotenv()
|
| 5 |
import os
|
| 6 |
import numpy as np
|
| 7 |
-
from
|
| 8 |
-
# from utils.data_loader import load_influencer_data
|
| 9 |
-
from utils.models_loader import ST , llm
|
| 10 |
import numpy as np
|
| 11 |
-
from langchain_core.messages import SystemMessage
|
| 12 |
-
import re
|
| 13 |
import faiss
|
| 14 |
-
import
|
| 15 |
-
import
|
| 16 |
-
from .
|
| 17 |
-
from utils.load_embeddings import embeddings , index
|
| 18 |
|
| 19 |
-
os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
|
| 20 |
-
# @tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
|
| 21 |
def retrieve_tool(video_topic):
|
| 22 |
'''
|
| 23 |
Always invoke this tool.
|
| 24 |
Retrieve influencer's data by semantic search of **video topic**.
|
| 25 |
'''
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
|
| 29 |
-
top_k=10
|
| 30 |
-
distances, indices = index.search(query_embedding, top_k)
|
| 31 |
-
|
| 32 |
-
# === Format results ===
|
| 33 |
-
outer_list = []
|
| 34 |
-
for i, idx in enumerate(indices[0]):
|
| 35 |
-
res = {
|
| 36 |
-
'rank': i + 1,
|
| 37 |
-
'username': df.iloc[idx]['username'],
|
| 38 |
-
'story': df.iloc[idx]['story'],
|
| 39 |
-
'visible_text_or_brandings': df.iloc[idx]['visible_texts_or_brandings'],
|
| 40 |
-
'likesCount': df.iloc[idx]['likesCount'],
|
| 41 |
-
'commentCount': df.iloc[idx]['commentCount'],
|
| 42 |
-
}
|
| 43 |
-
|
| 44 |
-
inner_list = []
|
| 45 |
-
inner_list.append(f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**")
|
| 46 |
-
inner_list.append(f"The story of that particular video is:\n{res['story']}")
|
| 47 |
-
inner_list.append(f"The branding or promotion done is:\n{res['visible_text_or_brandings']}")
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
return str(outer_list)
|
| 52 |
-
|
| 53 |
-
def retrieve_manual(video_topic):
|
| 54 |
-
'''
|
| 55 |
-
Always invoke this tool.
|
| 56 |
-
Retrieve influencer's data by semantic search of **video topic**.
|
| 57 |
-
'''
|
| 58 |
-
df = pd.read_csv('extracted_data.csv')
|
| 59 |
-
query_embedding = ST.encode(str(video_topic)).reshape(1, -1).astype('float32')
|
| 60 |
-
top_k=5
|
| 61 |
distances, indices = index.search(query_embedding, top_k)
|
| 62 |
|
|
|
|
|
|
|
| 63 |
|
|
|
|
|
|
|
| 64 |
|
| 65 |
# === Format results ===
|
| 66 |
outer_list = []
|
| 67 |
-
for
|
|
|
|
| 68 |
res = {
|
| 69 |
-
'rank':
|
| 70 |
-
'username':
|
| 71 |
-
'
|
| 72 |
-
'
|
| 73 |
-
'
|
| 74 |
-
'
|
| 75 |
}
|
| 76 |
|
| 77 |
-
inner_list = [
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
outer_list.append(inner_list)
|
| 83 |
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
|
|
|
| 1 |
+
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
import numpy as np
|
| 4 |
+
from utils.models_loader import embedding_model , llm
|
|
|
|
|
|
|
| 5 |
import numpy as np
|
|
|
|
|
|
|
| 6 |
import faiss
|
| 7 |
+
import tiktoken
|
| 8 |
+
from utils.load_embeddings import index , df
|
| 9 |
+
from utils.utils import clean_text
|
|
|
|
| 10 |
|
|
|
|
|
|
|
| 11 |
def retrieve_tool(video_topic):
|
| 12 |
'''
|
| 13 |
Always invoke this tool.
|
| 14 |
Retrieve influencer's data by semantic search of **video topic**.
|
| 15 |
'''
|
| 16 |
+
query_embedding = np.array(embedding_model.embed_query(str(video_topic))).reshape(1, -1).astype('float32')
|
| 17 |
+
faiss.normalize_L2(query_embedding)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
top_k = len(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
distances, indices = index.search(query_embedding, top_k)
|
| 21 |
|
| 22 |
+
similarity_threshold = 0.35
|
| 23 |
+
selected = [(idx, sim) for idx, sim in zip(indices[0], distances[0]) if sim >= similarity_threshold]
|
| 24 |
|
| 25 |
+
if not selected:
|
| 26 |
+
return "No influencers found."
|
| 27 |
|
| 28 |
# === Format results ===
|
| 29 |
outer_list = []
|
| 30 |
+
for rank, (idx, sim) in enumerate(selected, 1):
|
| 31 |
+
row = df.iloc[idx]
|
| 32 |
res = {
|
| 33 |
+
'rank': rank,
|
| 34 |
+
'username': row['username'],
|
| 35 |
+
'visible_text_or_brandings': row['visible_texts_or_brandings'],
|
| 36 |
+
'likesCount': row['likesCount'],
|
| 37 |
+
'commentCount': row['commentCount'],
|
| 38 |
+
'product_or_service_details': row['product_or_service_details'],
|
| 39 |
}
|
| 40 |
|
| 41 |
+
inner_list = [
|
| 42 |
+
f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**",
|
| 43 |
+
f"The branding or promotion done is:\n{res['visible_text_or_brandings']}",
|
| 44 |
+
f"The details of product or service is:\n{res['product_or_service_details']}"
|
| 45 |
+
]
|
| 46 |
outer_list.append(inner_list)
|
| 47 |
|
| 48 |
+
cleaned_response = clean_text(str(outer_list))
|
| 49 |
+
encoding = tiktoken.encoding_for_model('gpt-4o-mini')
|
| 50 |
+
tokens = encoding.encode(cleaned_response)
|
| 51 |
+
trimmed_response = tokens[:1000]
|
| 52 |
+
return encoding.decode(trimmed_response)
|
| 53 |
|
brainstroming_agent/utils/utils.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
|
| 2 |
from langchain_core.messages import SystemMessage, ToolMessage, HumanMessage
|
| 3 |
-
from .tools import retrieve_tool
|
| 4 |
import base64
|
| 5 |
from PIL import Image
|
| 6 |
from io import BytesIO
|
|
@@ -11,14 +11,7 @@ import os
|
|
| 11 |
from langgraph.prebuilt import create_react_agent
|
| 12 |
import pandas as pd
|
| 13 |
from datasets import load_dataset
|
| 14 |
-
import
|
| 15 |
-
import faiss
|
| 16 |
-
import re
|
| 17 |
-
import numpy as np
|
| 18 |
-
from utils.models_loader import ST , llm
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
|
| 23 |
def generate_final_story(final_state):
|
| 24 |
if 'preferred_topics' in final_state:
|
|
@@ -41,7 +34,7 @@ def generate_final_story(final_state):
|
|
| 41 |
return final_state['stories'][-1]
|
| 42 |
else:
|
| 43 |
template = final_story_prompt(final_state)
|
| 44 |
-
influencers_data =
|
| 45 |
messages = [SystemMessage(content=template),
|
| 46 |
ToolMessage(content=f'''The business details is:\n{str(final_state)}\nThe data of influencers is:\n{influencers_data}''',tool_call_id='final_story_tool')]
|
| 47 |
react_agent=create_react_agent(
|
|
@@ -130,64 +123,7 @@ def save_to_db(business_details):
|
|
| 130 |
matched_df = df[df.apply(row_matches, axis=1)]
|
| 131 |
matched_df.to_csv('extracted_data.csv')
|
| 132 |
|
| 133 |
-
def manual_retrieval(messages, business_details):
|
| 134 |
-
# === Load CSV ===
|
| 135 |
-
csv_path = 'extracted_data.csv'
|
| 136 |
-
df = pd.read_csv(csv_path)
|
| 137 |
-
|
| 138 |
-
# === Parse stored embeddings ===
|
| 139 |
-
df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
|
| 140 |
-
embeddings = np.vstack(df['embeddings'].values).astype('float32')
|
| 141 |
-
|
| 142 |
-
# === Build FAISS index ===
|
| 143 |
-
dimension = embeddings.shape[1]
|
| 144 |
-
index = faiss.IndexFlatL2(dimension)
|
| 145 |
-
index.add(embeddings)
|
| 146 |
|
| 147 |
-
# === Load SentenceTransformer model ===
|
| 148 |
-
|
| 149 |
-
# === Encode the query and search ===
|
| 150 |
-
query_embedding = ST.encode(str(messages)+str(business_details)).reshape(1, -1).astype('float32')
|
| 151 |
-
top_k=3
|
| 152 |
-
distances, indices = index.search(query_embedding, top_k)
|
| 153 |
-
|
| 154 |
-
# === Function to extract sections 1 and 6 ===
|
| 155 |
-
def extract_story_and_branding(full_story):
|
| 156 |
-
full_story = full_story.replace('**6. Visible Texts or Brandings**', '**6. Visible Texts or Brandings:**')
|
| 157 |
-
full_story = full_story.replace('**1. Story**', '**1. Story:**')
|
| 158 |
-
|
| 159 |
-
pattern = (
|
| 160 |
-
r"\*\*1\. Story:\*\*(.*?)(?=\*\*\d+\.\s)"
|
| 161 |
-
r".*?"
|
| 162 |
-
r"\*\*6\. Visible Texts or Brandings:\*\*(.*?)(?=\*\*\d+\.\s|$)"
|
| 163 |
-
)
|
| 164 |
-
match = re.search(pattern, full_story, re.DOTALL)
|
| 165 |
-
if match:
|
| 166 |
-
story_section = match.group(1).strip()
|
| 167 |
-
branding_section = match.group(2).strip()
|
| 168 |
-
return f"Story:\n{story_section}\n\nVisible Texts or Brandings:\n{branding_section}"
|
| 169 |
-
else:
|
| 170 |
-
return "Requested sections not found."
|
| 171 |
-
|
| 172 |
-
# === Format results ===
|
| 173 |
-
outer_list = []
|
| 174 |
-
for i, idx in enumerate(indices[0]):
|
| 175 |
-
res = {
|
| 176 |
-
'rank': i + 1,
|
| 177 |
-
'username': df.iloc[idx]['username'],
|
| 178 |
-
'agentic_story': df.iloc[idx]['agentic_story'],
|
| 179 |
-
'likesCount': df.iloc[idx]['likesCount'],
|
| 180 |
-
'commentCount': df.iloc[idx]['commentCount'],
|
| 181 |
-
'distance': distances[0][i]
|
| 182 |
-
}
|
| 183 |
-
|
| 184 |
-
inner_list = []
|
| 185 |
-
inner_list.append(f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**")
|
| 186 |
-
inner_list.append(f"The story of that particular video is:\n{extract_story_and_branding(res['agentic_story'])}")
|
| 187 |
-
inner_list.append(f"Distance: {res['distance']:.4f}")
|
| 188 |
-
outer_list.append(inner_list)
|
| 189 |
-
|
| 190 |
-
return str(outer_list)
|
| 191 |
|
| 192 |
|
| 193 |
|
|
|
|
| 1 |
|
| 2 |
from langchain_core.messages import SystemMessage, ToolMessage, HumanMessage
|
| 3 |
+
from .tools import retrieve_tool
|
| 4 |
import base64
|
| 5 |
from PIL import Image
|
| 6 |
from io import BytesIO
|
|
|
|
| 11 |
from langgraph.prebuilt import create_react_agent
|
| 12 |
import pandas as pd
|
| 13 |
from datasets import load_dataset
|
| 14 |
+
from utils.models_loader import llm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def generate_final_story(final_state):
|
| 17 |
if 'preferred_topics' in final_state:
|
|
|
|
| 34 |
return final_state['stories'][-1]
|
| 35 |
else:
|
| 36 |
template = final_story_prompt(final_state)
|
| 37 |
+
influencers_data = retrieve_tool(final_state)
|
| 38 |
messages = [SystemMessage(content=template),
|
| 39 |
ToolMessage(content=f'''The business details is:\n{str(final_state)}\nThe data of influencers is:\n{influencers_data}''',tool_call_id='final_story_tool')]
|
| 40 |
react_agent=create_react_agent(
|
|
|
|
| 123 |
matched_df = df[df.apply(row_matches, axis=1)]
|
| 124 |
matched_df.to_csv('extracted_data.csv')
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
|
ideation_agent/utils/__pycache__/tools.cpython-312.pyc
CHANGED
|
Binary files a/ideation_agent/utils/__pycache__/tools.cpython-312.pyc and b/ideation_agent/utils/__pycache__/tools.cpython-312.pyc differ
|
|
|
ideation_agent/utils/tools.py
CHANGED
|
@@ -5,8 +5,10 @@ import pandas as pd
|
|
| 5 |
import numpy as np
|
| 6 |
import ast
|
| 7 |
import faiss
|
| 8 |
-
|
| 9 |
-
from utils.
|
|
|
|
|
|
|
| 10 |
|
| 11 |
@tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
|
| 12 |
def retrieve_tool(business_details):
|
|
@@ -14,34 +16,40 @@ def retrieve_tool(business_details):
|
|
| 14 |
Always invoke this tool.
|
| 15 |
Retrieve influencer's data by semantic search of **business details**.
|
| 16 |
'''
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
df = pd.read_csv(csv_path)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
query_embedding = ST.encode(str(business_details)).reshape(1, -1).astype('float32')
|
| 23 |
-
top_k=7
|
| 24 |
distances, indices = index.search(query_embedding, top_k)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# === Format results ===
|
| 27 |
outer_list = []
|
| 28 |
-
for
|
|
|
|
| 29 |
res = {
|
| 30 |
-
'rank':
|
| 31 |
-
'username':
|
| 32 |
-
'
|
| 33 |
-
'
|
| 34 |
-
'
|
| 35 |
-
'
|
| 36 |
-
'product_or_service_details': df.iloc[idx]['product_or_service_details'],
|
| 37 |
-
|
| 38 |
}
|
| 39 |
|
| 40 |
-
inner_list = [
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
outer_list.append(inner_list)
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import ast
|
| 7 |
import faiss
|
| 8 |
+
import tiktoken
|
| 9 |
+
from utils.models_loader import embedding_model
|
| 10 |
+
from utils.load_embeddings import embeddings , index , df
|
| 11 |
+
from utils.utils import clean_text
|
| 12 |
|
| 13 |
@tool("influencers_data_retrieval_tool", args_schema=QueryFormatter, return_direct=False,description="Retrieve influencer-related data for a given query.")
|
| 14 |
def retrieve_tool(business_details):
|
|
|
|
| 16 |
Always invoke this tool.
|
| 17 |
Retrieve influencer's data by semantic search of **business details**.
|
| 18 |
'''
|
| 19 |
+
query_embedding = np.array(embedding_model.embed_query(str(business_details))).reshape(1, -1).astype('float32')
|
| 20 |
+
faiss.normalize_L2(query_embedding)
|
|
|
|
| 21 |
|
| 22 |
+
top_k = len(df)
|
|
|
|
|
|
|
| 23 |
distances, indices = index.search(query_embedding, top_k)
|
| 24 |
|
| 25 |
+
similarity_threshold = 0.35
|
| 26 |
+
selected = [(idx, sim) for idx, sim in zip(indices[0], distances[0]) if sim >= similarity_threshold]
|
| 27 |
+
|
| 28 |
+
if not selected:
|
| 29 |
+
return "No influencers found."
|
| 30 |
+
|
| 31 |
# === Format results ===
|
| 32 |
outer_list = []
|
| 33 |
+
for rank, (idx, sim) in enumerate(selected, 1):
|
| 34 |
+
row = df.iloc[idx]
|
| 35 |
res = {
|
| 36 |
+
'rank': rank,
|
| 37 |
+
'username': row['username'],
|
| 38 |
+
'visible_text_or_brandings': row['visible_texts_or_brandings'],
|
| 39 |
+
'likesCount': row['likesCount'],
|
| 40 |
+
'commentCount': row['commentCount'],
|
| 41 |
+
'product_or_service_details': row['product_or_service_details'],
|
|
|
|
|
|
|
| 42 |
}
|
| 43 |
|
| 44 |
+
inner_list = [
|
| 45 |
+
f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**",
|
| 46 |
+
f"The branding or promotion done is:\n{res['visible_text_or_brandings']}",
|
| 47 |
+
f"The details of product or service is:\n{res['product_or_service_details']}"
|
| 48 |
+
]
|
| 49 |
outer_list.append(inner_list)
|
| 50 |
|
| 51 |
+
cleaned_response = clean_text(str(outer_list))
|
| 52 |
+
encoding = tiktoken.encoding_for_model('gpt-4o-mini')
|
| 53 |
+
tokens = encoding.encode(cleaned_response)
|
| 54 |
+
trimmed_response = tokens[:1000]
|
| 55 |
+
return encoding.decode(trimmed_response)
|
orchestration_agent/utils/tools.py
CHANGED
|
@@ -2,51 +2,50 @@ import faiss
|
|
| 2 |
import ast
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
-
from utils.models_loader import ST
|
| 6 |
-
import json
|
| 7 |
from utils.load_embeddings import df, embeddings , index
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def retrieve_data_for_analytics(business_details):
|
| 10 |
'''
|
| 11 |
Always invoke this tool.
|
| 12 |
Retrieve influencer's data by semantic search of **business details**.
|
| 13 |
'''
|
| 14 |
-
df = pd.read_csv('extracted_data.csv')
|
| 15 |
|
| 16 |
# === Encode the query and search ===
|
| 17 |
-
query_embedding =
|
| 18 |
-
top_k =
|
| 19 |
distances, indices = index.search(query_embedding, top_k)
|
| 20 |
|
| 21 |
# === Format results ===
|
| 22 |
results = []
|
| 23 |
for i, idx in enumerate(indices[0]):
|
|
|
|
|
|
|
| 24 |
res = {
|
| 25 |
'url': df.iloc[idx]['videoUrl'],
|
| 26 |
'username': df.iloc[idx]['username'],
|
| 27 |
-
'likesCount': int(
|
| 28 |
-
|
| 29 |
}
|
| 30 |
results.append(res)
|
| 31 |
|
| 32 |
return results
|
| 33 |
|
| 34 |
def retrieve_data_for_orchestration(query):
|
| 35 |
-
|
| 36 |
-
# === Encode and normalize query ===
|
| 37 |
-
query_embedding = ST.encode(str(query)).reshape(1, -1).astype('float32')
|
| 38 |
faiss.normalize_L2(query_embedding)
|
| 39 |
|
| 40 |
-
# === Search with high top_k to filter later ===
|
| 41 |
top_k = len(df)
|
| 42 |
distances, indices = index.search(query_embedding, top_k)
|
| 43 |
|
| 44 |
-
|
| 45 |
-
similarity_threshold = 0.60
|
| 46 |
selected = [(idx, sim) for idx, sim in zip(indices[0], distances[0]) if sim >= similarity_threshold]
|
| 47 |
|
| 48 |
if not selected:
|
| 49 |
-
return "
|
| 50 |
|
| 51 |
# === Format results ===
|
| 52 |
outer_list = []
|
|
@@ -55,7 +54,6 @@ def retrieve_data_for_orchestration(query):
|
|
| 55 |
res = {
|
| 56 |
'rank': rank,
|
| 57 |
'username': row['username'],
|
| 58 |
-
# 'story': row['story'],
|
| 59 |
'visible_text_or_brandings': row['visible_texts_or_brandings'],
|
| 60 |
'likesCount': row['likesCount'],
|
| 61 |
'commentCount': row['commentCount'],
|
|
@@ -64,22 +62,17 @@ def retrieve_data_for_orchestration(query):
|
|
| 64 |
|
| 65 |
inner_list = [
|
| 66 |
f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**",
|
| 67 |
-
# f"The story of that particular video is:\n{res['story']}",
|
| 68 |
f"The branding or promotion done is:\n{res['visible_text_or_brandings']}",
|
| 69 |
f"The details of product or service is:\n{res['product_or_service_details']}"
|
| 70 |
]
|
| 71 |
outer_list.append(inner_list)
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
tokens =
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
# === If >1000 tokens, trim from back, keeping top ranks ===
|
| 78 |
-
if len(tokens) > 1000:
|
| 79 |
-
tokens = tokens[:1000]
|
| 80 |
-
trimmed_response = ' '.join(tokens)
|
| 81 |
-
return trimmed_response
|
| 82 |
-
else:
|
| 83 |
-
return '\n\n'.join(flat_lines)
|
| 84 |
|
| 85 |
|
|
|
|
| 2 |
import ast
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
| 5 |
from utils.load_embeddings import df, embeddings , index
|
| 6 |
+
from utils.models_loader import embedding_model
|
| 7 |
+
from utils.utils import clean_text
|
| 8 |
+
import tiktoken
|
| 9 |
|
| 10 |
def retrieve_data_for_analytics(business_details):
|
| 11 |
'''
|
| 12 |
Always invoke this tool.
|
| 13 |
Retrieve influencer's data by semantic search of **business details**.
|
| 14 |
'''
|
| 15 |
+
# df = pd.read_csv('extracted_data.csv')
|
| 16 |
|
| 17 |
# === Encode the query and search ===
|
| 18 |
+
query_embedding = np.array(embedding_model.embed_query(str(business_details))).reshape(1, -1).astype('float32')
|
| 19 |
+
top_k = 10
|
| 20 |
distances, indices = index.search(query_embedding, top_k)
|
| 21 |
|
| 22 |
# === Format results ===
|
| 23 |
results = []
|
| 24 |
for i, idx in enumerate(indices[0]):
|
| 25 |
+
likes = df.iloc[idx]['likesCount']
|
| 26 |
+
comments = df.iloc[idx]['commentCount']
|
| 27 |
res = {
|
| 28 |
'url': df.iloc[idx]['videoUrl'],
|
| 29 |
'username': df.iloc[idx]['username'],
|
| 30 |
+
'likesCount': int(likes) if pd.notnull(likes) else None,
|
| 31 |
+
'commentCount': int(comments) if pd.notnull(comments) else None
|
| 32 |
}
|
| 33 |
results.append(res)
|
| 34 |
|
| 35 |
return results
|
| 36 |
|
| 37 |
def retrieve_data_for_orchestration(query):
|
| 38 |
+
query_embedding = np.array(embedding_model.embed_query(str(query))).reshape(1, -1).astype('float32')
|
|
|
|
|
|
|
| 39 |
faiss.normalize_L2(query_embedding)
|
| 40 |
|
|
|
|
| 41 |
top_k = len(df)
|
| 42 |
distances, indices = index.search(query_embedding, top_k)
|
| 43 |
|
| 44 |
+
similarity_threshold = 0.35
|
|
|
|
| 45 |
selected = [(idx, sim) for idx, sim in zip(indices[0], distances[0]) if sim >= similarity_threshold]
|
| 46 |
|
| 47 |
if not selected:
|
| 48 |
+
return "No influencers found."
|
| 49 |
|
| 50 |
# === Format results ===
|
| 51 |
outer_list = []
|
|
|
|
| 54 |
res = {
|
| 55 |
'rank': rank,
|
| 56 |
'username': row['username'],
|
|
|
|
| 57 |
'visible_text_or_brandings': row['visible_texts_or_brandings'],
|
| 58 |
'likesCount': row['likesCount'],
|
| 59 |
'commentCount': row['commentCount'],
|
|
|
|
| 62 |
|
| 63 |
inner_list = [
|
| 64 |
f"[{res['rank']}]. The influencer name is: **{res['username']}** — Likes: **{res['likesCount']}**, Comments: **{res['commentCount']}**",
|
|
|
|
| 65 |
f"The branding or promotion done is:\n{res['visible_text_or_brandings']}",
|
| 66 |
f"The details of product or service is:\n{res['product_or_service_details']}"
|
| 67 |
]
|
| 68 |
outer_list.append(inner_list)
|
| 69 |
|
| 70 |
+
cleaned_response = clean_text(str(outer_list))
|
| 71 |
+
encoding = tiktoken.encoding_for_model('gpt-4o-mini')
|
| 72 |
+
tokens = encoding.encode(cleaned_response)
|
| 73 |
+
trimmed_response = tokens[:1000]
|
| 74 |
+
return encoding.decode(trimmed_response)
|
| 75 |
+
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
|
orchestration_agent/utils/utils.py
CHANGED
|
@@ -7,6 +7,7 @@ from .prompts import captioning_prompt
|
|
| 7 |
from utils.models_loader import llm
|
| 8 |
from langchain_core.messages import FunctionMessage , AIMessage
|
| 9 |
from .tools import retrieve_data_for_analytics
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def caption_image(image_base64,user_input):
|
|
@@ -61,3 +62,5 @@ def extract_latest_response_block(response):
|
|
| 61 |
temp_block = []
|
| 62 |
print('The latest block', latest_block)
|
| 63 |
return latest_block
|
|
|
|
|
|
|
|
|
| 7 |
from utils.models_loader import llm
|
| 8 |
from langchain_core.messages import FunctionMessage , AIMessage
|
| 9 |
from .tools import retrieve_data_for_analytics
|
| 10 |
+
import re
|
| 11 |
|
| 12 |
|
| 13 |
def caption_image(image_base64,user_input):
|
|
|
|
| 62 |
temp_block = []
|
| 63 |
print('The latest block', latest_block)
|
| 64 |
return latest_block
|
| 65 |
+
|
| 66 |
+
|
requirements.txt
CHANGED
|
@@ -18,4 +18,7 @@ langchain_openai
|
|
| 18 |
nltk
|
| 19 |
scikit-learn
|
| 20 |
pandas
|
|
|
|
|
|
|
|
|
|
| 21 |
|
|
|
|
| 18 |
nltk
|
| 19 |
scikit-learn
|
| 20 |
pandas
|
| 21 |
+
langchain-community
|
| 22 |
+
tiktoken
|
| 23 |
+
langchain-anthropic
|
| 24 |
|
utils/__init__.py
ADDED
|
File without changes
|
utils/__pycache__/data_loader.cpython-312.pyc
CHANGED
|
Binary files a/utils/__pycache__/data_loader.cpython-312.pyc and b/utils/__pycache__/data_loader.cpython-312.pyc differ
|
|
|
utils/__pycache__/models_loader.cpython-312.pyc
CHANGED
|
Binary files a/utils/__pycache__/models_loader.cpython-312.pyc and b/utils/__pycache__/models_loader.cpython-312.pyc differ
|
|
|
utils/data_loader.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
|
|
|
|
| 2 |
from datasets import load_dataset
|
| 3 |
-
dataset = load_dataset("subashdvorak/tiktok-formatted-story-v2", revision="embedded")
|
| 4 |
-
data = dataset['train'].add_faiss_index('embeddings')
|
| 5 |
|
| 6 |
def load_influencer_data():
|
| 7 |
-
return
|
|
|
|
| 8 |
|
| 9 |
|
|
|
|
| 1 |
|
| 2 |
+
print('Loading Dataset..................')
|
| 3 |
from datasets import load_dataset
|
| 4 |
+
# dataset = load_dataset("subashdvorak/tiktok-formatted-story-v2", revision="embedded")
|
| 5 |
+
# data = dataset['train'].add_faiss_index('embeddings')
|
| 6 |
|
| 7 |
def load_influencer_data():
|
| 8 |
+
return 'Some error occouring'
|
| 9 |
+
print('Dataset loaded.................')
|
| 10 |
|
| 11 |
|
utils/load_embeddings.py
CHANGED
|
@@ -2,10 +2,13 @@ import numpy as np
|
|
| 2 |
import ast
|
| 3 |
import faiss
|
| 4 |
import pandas as pd
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def load_index_once():
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
|
| 10 |
embeddings = np.vstack(df['embeddings'].values).astype('float32')
|
| 11 |
faiss.normalize_L2(embeddings)
|
|
|
|
| 2 |
import ast
|
| 3 |
import faiss
|
| 4 |
import pandas as pd
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
|
| 7 |
|
| 8 |
def load_index_once():
|
| 9 |
+
dataset = load_dataset("DvorakInnovationAI/rt-genai-dataset-v1", revision="openai-embeddings")
|
| 10 |
+
df = dataset["train"]
|
| 11 |
+
df= df.to_pandas()
|
| 12 |
df['embeddings'] = df['embeddings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
|
| 13 |
embeddings = np.vstack(df['embeddings'].values).astype('float32')
|
| 14 |
faiss.normalize_L2(embeddings)
|
utils/models_loader.py
CHANGED
|
@@ -1,28 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from langchain_groq import ChatGroq
|
| 2 |
from langchain_openai import ChatOpenAI
|
| 3 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
# from huggingface_hub import InferenceClient
|
| 7 |
from huggingface_hub import login
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
load_dotenv()
|
| 10 |
-
import os
|
| 11 |
-
import requests
|
| 12 |
-
import numpy as np
|
| 13 |
-
# from langchain_huggingface import HuggingFaceEndpoint
|
| 14 |
os.environ['HUGGINGFACEHUB_ACCESS_TOKEN']=os.getenv('HUGGINGFACEHUB_ACCESS_TOKEN')
|
| 15 |
login(os.environ['HUGGINGFACEHUB_ACCESS_TOKEN'])
|
| 16 |
os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
llm_gemini = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
|
| 21 |
|
| 22 |
llm = ChatGroq(
|
| 23 |
model="llama-3.1-8b-instant",
|
| 24 |
temperature=0.7,
|
| 25 |
-
|
| 26 |
)
|
| 27 |
|
| 28 |
llm_gpt = ChatOpenAI(
|
|
@@ -56,20 +53,10 @@ ST = HFEmbeddingAPI(
|
|
| 56 |
token=os.environ.get('HUGGINGFACEHUB_ACCESS_TOKEN')
|
| 57 |
)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
improver_llm = ChatOpenAI(
|
| 64 |
-
model="gpt-4o-mini",
|
| 65 |
-
temperature=0.7,
|
| 66 |
-
max_tokens=500,
|
| 67 |
-
)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
ideator_llm = llm
|
| 71 |
-
critic_llm = llm
|
| 72 |
-
validator_llm = llm
|
| 73 |
|
| 74 |
|
| 75 |
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import numpy as np
|
| 4 |
from langchain_groq import ChatGroq
|
| 5 |
from langchain_openai import ChatOpenAI
|
| 6 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 7 |
+
from langchain_anthropic import ChatAnthropic
|
| 8 |
+
from langchain_openai import OpenAIEmbeddings
|
|
|
|
| 9 |
from huggingface_hub import login
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
load_dotenv()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
os.environ['HUGGINGFACEHUB_ACCESS_TOKEN']=os.getenv('HUGGINGFACEHUB_ACCESS_TOKEN')
|
| 13 |
login(os.environ['HUGGINGFACEHUB_ACCESS_TOKEN'])
|
| 14 |
os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
|
| 15 |
|
| 16 |
+
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1536)
|
| 17 |
+
llm_anthropic = ChatAnthropic(model='claude-3-5-sonnet-20241022', temperature=0.7, max_tokens=500)
|
| 18 |
llm_gemini = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
|
| 19 |
|
| 20 |
llm = ChatGroq(
|
| 21 |
model="llama-3.1-8b-instant",
|
| 22 |
temperature=0.7,
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
llm_gpt = ChatOpenAI(
|
|
|
|
| 53 |
token=os.environ.get('HUGGINGFACEHUB_ACCESS_TOKEN')
|
| 54 |
)
|
| 55 |
|
| 56 |
+
improver_llm = llm_anthropic
|
| 57 |
+
ideator_llm = llm_anthropic
|
| 58 |
+
critic_llm = llm_anthropic
|
| 59 |
+
validator_llm = llm_anthropic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
|
utils/utils.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
def clean_text(text: str) -> str:
|
| 3 |
+
"""
|
| 4 |
+
General-purpose text cleaner for LLMs or downstream NLP tasks.
|
| 5 |
+
Removes special characters, escape sequences, excess spaces, and normalizes punctuation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
if not isinstance(text, str):
|
| 9 |
+
return ""
|
| 10 |
+
|
| 11 |
+
# Normalize encoded newlines and tabs
|
| 12 |
+
text = text.replace("\\n", "\n").replace("\\t", " ")
|
| 13 |
+
|
| 14 |
+
# Remove stray backslashes (\\), unless part of newline
|
| 15 |
+
text = re.sub(r"\\(?!n)", '', text)
|
| 16 |
+
|
| 17 |
+
# Remove brackets often used for metadata or markup
|
| 18 |
+
text = re.sub(r'[\[\]{}<>]', '', text)
|
| 19 |
+
|
| 20 |
+
# Remove quotes
|
| 21 |
+
text = re.sub(r"[\"']", '', text)
|
| 22 |
+
|
| 23 |
+
# Remove special characters except basic punctuation (.,!?)
|
| 24 |
+
text = re.sub(r"[^a-zA-Z0-9.,!? \n]", '', text)
|
| 25 |
+
|
| 26 |
+
# Remove repeated punctuation like "!!!" or "???"
|
| 27 |
+
text = re.sub(r'([!?.,]){2,}', r'\1', text)
|
| 28 |
+
|
| 29 |
+
# Normalize multiple spaces and newlines
|
| 30 |
+
text = re.sub(r'[ \t]+', ' ', text)
|
| 31 |
+
text = re.sub(r'\n{3,}', '\n\n', text) # Collapse more than 2 newlines to just 2
|
| 32 |
+
text = re.sub(r' *\n *', '\n', text) # Clean spaces around newlines
|
| 33 |
+
|
| 34 |
+
return text.strip()
|