lenpanda committed · verified
Commit d0537bb · 1 Parent(s): 4db6ae4

Upload 2 files

Files changed (2)
  1. app.py +397 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,397 @@
import os
import re
import ast
import time
import operator
import logging

import chromadb
import streamlit as st

from typing_extensions import TypedDict, Annotated

from langchain_chroma import Chroma
from langchain_tavily import TavilySearch
from langchain_core.documents import Document
from langchain_core.messages import AnyMessage, AIMessage, HumanMessage, SystemMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langgraph.graph import StateGraph, START, END
from langgraph.prebuilt import create_react_agent

# logging: append run events to a local log file
logger = logging.getLogger("runs_logger")
logger.setLevel(logging.INFO)

if not logger.handlers:
    file_handler = logging.FileHandler("./running_logs.log", mode="a")
    file_handler.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)

# Tavily API key
tavily_api_key = os.getenv("TAVILY_API_KEY")

# Gemini API key
gemini_api_key = os.getenv("GOOGLE_API_KEY")

# embeddings model used to embed search results before storing them in the vector DB
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=gemini_api_key)

# Tavily search initialization
tavily_search = TavilySearch(max_results=1, api_key=tavily_api_key, topic="general", include_raw_content=True)

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    api_key=gemini_api_key
)

# state definition: messages and subtopic titles accumulate across nodes
class State(TypedDict):
    messages: Annotated[list[AnyMessage], operator.add]
    title: Annotated[list, operator.add]

# renamed from `format` to avoid shadowing the builtin
output_format = {
    "subtopics": [
        {
            "title": "Subtopic Title",
            "search_queries": ["query1", "query2"]
        }
    ]
}

prompt = f"""
You are a deep research expert. Your job is to break a broad topic into several detailed subtopics.
For each subtopic, provide a maximum of **four** web search queries that can help collect relevant data.

Your output must strictly follow this JSON-like format:
{output_format}

Example:
If the topic is "climate change", one subtopic might be "effects on agriculture", and search queries could be:
["impact of climate change on agriculture", "climate change and crop yields"]

Goal: These search queries will be used to gather web data for generating a detailed report.

Now generate subtopics and search queries for the topic: "{{topic}}"
"""

# agent to create subtopics and their related search queries
query_generator_agent = create_react_agent(llm, tools=[], prompt=prompt)

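# NOTE: with an empty tool list, create_react_agent effectively acts as a plain
# LLM node: it just replies with the subtopics/queries JSON that web_search
# extracts from the ```json fence below.
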
# clear any cached Chroma client state before creating a new collection
chromadb.api.client.SharedSystemClient.clear_system_cache()

vector_db = Chroma(collection_name="research_data_2", embedding_function=embeddings)

# add raw content from a Tavily search result to the vector DB
def add_to_vectorDB(doc):
    if not doc:
        return False

    try:
        logger.info(f"Adding document to vector DB: {doc.metadata.get('title', 'No title')}")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        splits = text_splitter.split_documents([doc])
        logger.info(f"Split into {len(splits)} chunks")
        vector_db.add_documents(splits)
        logger.info("Successfully added document to vector DB")
        return True

    except Exception as e:
        logger.error(f"Error adding document to vector DB: {e}")
        return False

def web_search(state: State):
    """
    Uses the latest message to extract subtopics and web search queries,
    then adds the raw content of each search result to the vector database.
    """
    last_message = state['messages'][-1]
    pattern = r"```json\s*(.*?)\s*```"

    if not isinstance(last_message, AIMessage):
        return {"messages": [AIMessage(content="no AI message in messages")]}

    logger.info("starting pattern search")
    subtopics_match = re.search(pattern, last_message.content, re.DOTALL)
    logger.info(f"found pattern {subtopics_match}")

    if not subtopics_match:
        return {"messages": [AIMessage(content="unable to extract subtopics")]}

    # the regex returns a string, so parse it back into a dict
    result = ast.literal_eval(subtopics_match.group(1))

    titles = []
    for i, content in enumerate(result['subtopics']):
        title = content.get("title")
        if title:
            metadata = {"title": title}
            titles.append(title)
        else:
            metadata = {"title": "no title"}

        for query in content.get('search_queries', []):
            logger.info(f"starting search {i}, query: {query}")
            try:
                search_result = tavily_search.invoke({"query": query})

                if search_result:
                    logger.info(f"found search result {i}")
                    raw_content = search_result["results"][0].get("raw_content", "No content")

                    if raw_content:
                        raw_content = raw_content.replace("\n", " ")  # replace() returns a new string
                        # wrap in a Document, the input type add_to_vectorDB expects
                        doc = Document(page_content=raw_content, metadata=metadata)
                        add_to_vectorDB(doc)
                    else:
                        logger.info(f"no raw content found for search {i}")

            except Exception as e:
                logger.error(f"unable to perform search, {e}")
                return {"messages": [AIMessage(content=f"unable to perform search, error: {e}")],
                        "title": titles}

        logger.info("sleeping for 6 seconds to stay under the search API rate limit")
        time.sleep(6)

    try:
        db_size = len(vector_db.get()['documents'])
        result_text = f"added {db_size} elements to vector db"
    except Exception as e:
        result_text = f"error finding size of vector db, check if it is initialized: {e}"

    # node functions return partial state updates; the operator.add reducers merge them
    return {"messages": [AIMessage(content=result_text)], "title": titles}

summarizer_instructions = """
You are a specialized research assistant responsible for generating detailed, comprehensive research reports based on retrieved documents. Your reports must demonstrate academic rigor, analytical depth, and thorough coverage of all aspects of each topic.

REPORT STRUCTURE AND CONTENT REQUIREMENTS:
For each subject (e.g., historical figure, event, movement, or development), provide:

1. COMPREHENSIVE OVERVIEW (1-2 paragraphs):
- Clear definition and significance of the subject
- Temporal and geographical context
- Brief introduction to key themes that will be explored

2. DETAILED ANALYSIS BY SUBTOPIC:
Each subtopic should include:

## [Subtopic Title]

**Historical Context:**
- Thorough exploration of preceding events and conditions
- Cultural, political, and social environment
- Relevant ideological currents or intellectual foundations

**Core Developments:**
- Chronological progression of key events
- Critical turning points and catalyst moments
- Primary sources or documented evidence where applicable
- Different perspectives or interpretations by scholars

**Key Figures and Their Contributions:**
- Biographical details relevant to their role
- Specific actions, decisions, or works that proved influential
- Relationships with other significant actors or institutions

**Mechanisms of Change:**
- Analysis of how and why developments occurred
- Examination of power structures, resources, or tactical approaches
- Assessment of resistance or support from different sectors

**Short and Long-term Implications:**
- Immediate effects on contemporaneous systems or populations
- Lasting legacy and influence on subsequent developments
- Changes to institutions, laws, cultural practices, or social norms
- Global or regional ripple effects

**Critical Analysis:**
- Scholarly debates or competing interpretations
- Methodological considerations in studying this topic
- Gaps in historical knowledge or contested narratives

**Connections to Broader Themes:**
- Links to major historical processes (e.g., industrialization, globalization)
- Relationship to theoretical frameworks (e.g., colonialism, nationalism)
- Comparisons with similar developments in other contexts

3. VISUAL AND ORGANIZATIONAL ELEMENTS:
- Chronological timelines of key events
- Hierarchical relationships between actors or institutions
- Geographic distributions or movements
- Statistical data presented clearly when relevant

4. CONCLUDING SYNTHESIS:
- Integration of subtopics into a coherent narrative
- Assessment of overall historical significance
- Enduring questions or areas for further research

FORMATTING AND STYLE REQUIREMENTS:
- Use **Markdown** formatting for structure and readability
- Employ formal academic language while maintaining clarity
- Include precise dates, locations, and proper names
- Maintain objective, evidence-based analysis
- Avoid presentism or anachronistic judgments
- Use footnotes for clarifications or supplementary information
- Organize content with clear headers, subheaders, and logical paragraph breaks
- Include bullet points for lists of events, factors, or components
- The output is limited to plain text, so don't include images or graphical timelines

QUALITY STANDARDS:
- Prioritize depth over breadth
- Verify factual accuracy and consistency
- Address multiple perspectives or interpretations
- Acknowledge limitations of available evidence
- Maintain appropriate historical context throughout
- Ensure logical transitions between sections
- Avoid oversimplification of complex historical processes

The final report should function as a standalone, comprehensive academic resource that could serve as a foundation for further research, teaching materials, or policy analysis.
"""

# summarize content retrieved from the vector DB using the titles stored in state
def summarize_the_content(state: State):
    titles = state['title']
    full_content = ""

    for title in titles:
        if title:
            full_content += f"title: {title}\n"
            docs = vector_db.similarity_search(title)

            if docs:
                logger.info(f"successfully extracted the docs based on title: {title}")
                for doc in docs:
                    if isinstance(doc, Document):
                        full_content += f"\n{doc.page_content.strip()}\n"
                    else:
                        full_content += "\nNo content\n"
            else:
                logger.info(f"No docs found for {title}")

    summary = llm.invoke([SystemMessage(content=summarizer_instructions),
                          HumanMessage(content=full_content)])

    # return only the new message; the reducer appends it to the existing list
    return {"messages": [AIMessage(content=summary.content)]}

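# NOTE: Chroma's similarity_search returns the top 4 chunks by default, so each
# subtopic's section of the report is grounded in at most 4 retrieved chunks.
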
# graph initialization: START -> query_generator -> web_search -> summarize -> END
workflow = StateGraph(State)
workflow.add_node("query_generator", query_generator_agent)
workflow.add_node("web_search", web_search)
workflow.add_node("summarize", summarize_the_content)

workflow.add_edge(START, "query_generator")
workflow.add_edge("query_generator", "web_search")
workflow.add_edge("web_search", "summarize")
workflow.add_edge("summarize", END)

graph = workflow.compile()


# Streamlit UI
st.title("Deep research")

# take user input
user_input = st.text_input("Enter your topic to deep research")

if user_input:
    with st.spinner('Researching your topic... This may take a few minutes'):
        events = graph.invoke({"messages": [HumanMessage(content=user_input)]})

    st.success("Research Completed")
    st.markdown(events['messages'][-1].content)
requirements.txt ADDED
@@ -0,0 +1,10 @@
langchain-core
pydantic
typing-extensions
langchain-chroma
langchain-tavily
langgraph
langchain-text-splitters
chromadb
langchain-google-genai
streamlit
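
To try the app locally (assuming the `TAVILY_API_KEY` and `GOOGLE_API_KEY` environment variables are set), install the dependencies with `pip install -r requirements.txt`, then launch it with `streamlit run app.py`.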