Sanchayt committed on
Commit
e95a5b9
·
1 Parent(s): 2b367d7

Add application file

Browse files
Files changed (1) hide show
  1. app.py +194 -0
app.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import streamlit as st
4
+ from langchain import PromptTemplate
5
+ from langchain.agents import initialize_agent, Tool
6
+ from langchain.agents import AgentType
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.prompts import MessagesPlaceholder
9
+ from langchain.memory import ConversationSummaryBufferMemory
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain.chains.summarize import load_summarize_chain
12
+ from langchain.tools import BaseTool
13
+ from pydantic import BaseModel, Field
14
+ from typing import Type
15
+ from bs4 import BeautifulSoup
16
+ import requests
17
+ import json
18
+ from langchain.schema import SystemMessage
19
+ # from fastapi import FastAPI
20
+
21
# Load variables from a local .env file (if one exists) into the process
# environment before any key is looked up below.
load_dotenv()

# API credentials, read from the environment; each is None when unset.
# NOTE: "brwoserless" is a misspelling of "browserless". The name is kept
# as-is because scrape_website() reads this exact module-level variable.
brwoserless_api_key = os.environ.get("BROWSERLESS_API_KEY")
serper_api_key = os.environ.get("SERP_API_KEY")
open_ai_api = os.environ.get("OPENAI_API_KEY")
25
+
26
+
27
+ # 1. Tool for search
28
+
29
+
30
def search(query):
    """Run a web search through the Serper API and return the raw reply.

    Args:
        query: The search string; sent as the ``q`` field of the JSON body.

    Returns:
        The response body as text, unmodified. The body is returned even
        for non-2xx responses, so the caller sees the service's own error
        payload.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when it times out.
    """
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": query})
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json',
    }

    # requests waits indefinitely unless a timeout is supplied; bound the
    # call so an unresponsive endpoint cannot hang the whole app.
    response = requests.post(url, headers=headers, data=payload, timeout=30)

    print(response.text)

    return response.text
47
+
48
+
49
+ # 2. Tool for scraping
50
def scrape_website(objective: str, url: str):
    """Fetch a page through browserless.io and return its text content.

    The page is requested from the browserless ``/content`` endpoint, the
    returned HTML is reduced to plain text with BeautifulSoup, and text
    longer than 10,000 characters is condensed with ``summary()``.

    Args:
        objective: The original objective/task the user gave to the agent;
            forwarded to ``summary()`` when the page is too long.
        url: The URL of the website to scrape.

    Returns:
        The page text, or a summary of it when the text exceeds 10,000
        characters. When the service answers with a non-200 status, a
        short message naming the status code is returned instead, so the
        caller always receives a string it can act on.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when it times out.
    """
    print("Scraping website...")

    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }
    data_json = json.dumps({"url": url})

    post_url = f"https://chrome.browserless.io/content?token={brwoserless_api_key}"
    # Without a timeout requests would block forever on a stalled render;
    # bound the wait instead.
    response = requests.post(
        post_url, headers=headers, data=data_json, timeout=60)

    if response.status_code != 200:
        failure = f"HTTP request failed with status code {response.status_code}"
        print(failure)
        # Previously this branch fell through and returned None, which gave
        # the agent nothing to reason about. Return the message instead.
        return failure

    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text()
    print("CONTENTTTTTT:", text)

    if len(text) > 10000:
        return summary(objective, text)
    return text
86
+
87
+
88
def summary(objective, content):
    """Summarize ``content`` with respect to ``objective``.

    The text is split into chunks, each chunk is summarized, and the
    partial summaries are combined by a map-reduce summarize chain. The
    same prompt template is used for both the map and the combine step.

    Args:
        objective: What the summary should focus on; fills the
            ``{objective}`` slot of the prompt.
        content: The full text to summarize.

    Returns:
        Whatever the summarize chain's ``run`` call returns.
    """
    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

    prompt_text = """
Write a summary of the following text for {objective}:
"{text}"
SUMMARY:
"""
    shared_prompt = PromptTemplate(
        template=prompt_text, input_variables=["text", "objective"])

    # Split on paragraph breaks first, then on single newlines.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
    chunks = splitter.create_documents([content])

    chain = load_summarize_chain(
        llm=chat_model,
        chain_type='map_reduce',
        map_prompt=shared_prompt,
        combine_prompt=shared_prompt,
        verbose=True,
    )
    return chain.run(input_documents=chunks, objective=objective)
113
+
114
+
115
# Argument schema for the scrape_website tool; ScrapeWebsiteTool sets its
# args_schema to this model.
class ScrapeWebsiteInput(BaseModel):
    """Inputs for scrape_website"""
    # Forwarded as the `objective` argument of scrape_website().
    objective: str = Field(
        description="The objective & task that users give to the agent")
    # Forwarded as the `url` argument of scrape_website().
    url: str = Field(description="The url of the website to be scraped")
120
+
121
+
122
class ScrapeWebsiteTool(BaseTool):
    """Tool wrapper that exposes ``scrape_website`` to the agent.

    Arguments are described by ``ScrapeWebsiteInput`` (``objective`` and
    ``url``). Only synchronous execution is supported.
    """

    # Annotated explicitly so the overrides are declared as typed fields
    # rather than bare class attributes.
    name: str = "scrape_website"
    description: str = "useful when you need to get data from a website url, passing both url and objective to the function; DO NOT make up any url, the url should only be from the search results"
    args_schema: Type[BaseModel] = ScrapeWebsiteInput

    def _run(self, objective: str, url: str):
        """Scrape ``url`` and return its text, or a summary for ``objective``."""
        return scrape_website(objective, url)

    async def _arun(self, objective: str, url: str):
        """Async entry point; not supported.

        Takes the same arguments as ``_run`` so that an async call fails
        with ``NotImplementedError`` instead of a ``TypeError`` caused by a
        mismatched signature.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "scrape_website does not support async execution")
132
+
133
+
134
+ # 3. Create langchain agent with the tools above
135
# Tools handed to the agent: a plain function tool wrapping search(), and
# the structured scrape_website tool.
_search_tool = Tool(
    name="Search",
    description="useful for when you need to answer questions about current events, data. You should ask targeted questions",
    func=search,
)

tools = [_search_tool, ScrapeWebsiteTool()]
143
+
144
# System prompt for the research agent; passed to the agent through
# agent_kwargs["system_message"].
# NOTE(review): rules 5/ and 6/ are word-for-word identical and each repeats
# its own sentence twice, and rule 3/ contains the typo "iteratins". This
# looks like copy-paste, but the text is sent to the model verbatim, so it is
# left untouched here -- confirm whether the repetition is intentional.
system_message = SystemMessage(
    content="""You are a world class researcher, who can do detailed research on any topic and produce facts based results;
you do not make things up, you will try as hard as possible to gather facts & data to back up the research

Please make sure you complete the objective above with the following rules:
1/ You should do enough research to gather as much information as possible about the objective
2/ If there are url of relevant links & articles, you will scrape it to gather more information
3/ After scraping & search, you should think "is there any new things i should search & scraping based on the data I collected to increase research quality?" If answer is yes, continue; But don't do this more than 3 iteratins
4/ You should not make things up, you should only write facts & data that you have gathered
5/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research
6/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research"""
)
156
+
157
# Chat model used both by the agent and by the memory below.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

# Conversation memory, stored under the "memory" key.
memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="memory",
    max_token_limit=1000,
    return_messages=True,
)

# Extra prompt pieces for the agent. The placeholder's variable name must
# match the memory_key above so the stored messages land in the prompt.
agent_kwargs = {
    "system_message": system_message,
    "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
}

# NOTE(review): this runs at import time; Streamlit presumably re-executes
# the script on each interaction, which would rebuild the agent and its
# memory every time -- confirm whether that is intended.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    agent_kwargs=agent_kwargs,
    memory=memory,
    verbose=True,
)
174
+
175
+
176
def main():
    """Render the Streamlit page and run the agent on the entered goal.

    Shows a header and a text input. Once the input is non-empty, the
    agent is called with it and the ``output`` entry of its result is
    displayed in an info box.
    """
    st.set_page_config(page_title="AI research agent", page_icon=":bird:")
    st.header("AI research agent :bird:")

    research_goal = st.text_input("Research goal")
    if not research_goal:
        # Nothing entered yet; leave the page with just the input field.
        return

    st.write("Doing research for ", research_goal)
    outcome = agent({"input": research_goal})
    st.info(outcome['output'])


if __name__ == '__main__':
    main()
192
+
193
+
194
+