rairo committed on
Commit
6032808
·
verified ·
1 Parent(s): a1b4ab9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -158
app.py CHANGED
@@ -18,183 +18,202 @@ import urllib.parse
18
  subprocess.run(["playwright", "install"])
19
  nest_asyncio.apply()
20
 
21
- GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
22
 
23
  graph_config = {
24
-     "llm": {
25
-         "api_key": GOOGLE_API_KEY,
26
-         "model": "google_genai/gemini-pro",
27
-     },
28
  }
29
 
 
30
  def get_data(url):
31
-     smart_scraper_graph = SmartScraperGraph(
32
-         prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
33
-         source=url,
34
-         config=graph_config
35
-     )
36
-     return smart_scraper_graph.run()
 
 
 
 
 
37
 
38
  def process_multiple_urls(urls):
39
-     """
40
-     Process multiple URLs with progress tracking
41
-     """
42
-     all_data = {"grants": []}
43
-     progress_bar = st.progress(0)
44
-     status_container = st.empty()
45
-     
46
-     total_urls = len(urls)
47
-     for index, url in enumerate(urls):
48
-         try:
49
-             url = url.strip()
50
-             if not url:
51
-                 continue
52
-                 
53
-             # Update progress
54
-             progress = (index + 1) / total_urls
55
-             progress_bar.progress(progress)
56
-             
57
-             # Show current status
58
-             status_container.markdown(f"""
59
-             **Processing URL {index+1} of {total_urls}**  
60
-             🔍 Scanning: `{url}`  
61
-             ✅ Completed: {index}/{total_urls}  
62
-             ⏳ Remaining: {total_urls - index - 1}
63
-             """)
64
-             
65
-             # Scrape data
66
-             result = get_data(url)
67
-             if result and 'grants' in result:
68
-                 all_data['grants'].extend(result['grants'])
69
-                 
70
-         except Exception as e:
71
-             st.error(f"Error processing {url}: {str(e)}")
72
-             continue
73
-             
74
-     progress_bar.empty()
75
-     status_container.empty()
76
-     return all_data
 
 
77
 
78
  def convert_to_csv(data):
79
-     df = pd.DataFrame(data['grants'])
80
-     return df.to_csv(index=False).encode('utf-8')
 
81
 
82
  def convert_to_excel(data):
83
-     df = pd.DataFrame(data['grants'])
84
-     buffer = io.BytesIO()
85
-     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
86
-         df.to_excel(writer, sheet_name='Grants', index=False)
87
-     return buffer.getvalue()
 
88
 
89
  def create_knowledge_base(data):
90
-     documents = []
91
-     for grant in data['grants']:
92
-         doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
93
-         documents.append("\n".join(doc_parts))
 
 
 
94
 
95
-     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
96
-     texts = text_splitter.create_documents(documents)
97
 
98
-     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
99
-     vectorstore = FAISS.from_documents(texts, embeddings)
 
 
 
100
 
101
-     llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
102
-     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
103
-     return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
104
 
105
  def get_shareable_link(file_data, file_name, file_type):
106
-     b64 = base64.b64encode(file_data).decode()
107
-     return f"data:{file_type};base64,{b64}"
 
108
 
109
  def main():
110
-     st.sidebar.title("Quantilytix Grant Scraper")
111
-     st.sidebar.image("logoqb.jpeg", use_container_width=True)
112
-
113
-     # Initialize session state for scraped data and chat history if not already present
114
-     if "scraped_data" not in st.session_state:
115
-         st.session_state.scraped_data = None
116
-     if "chat_history" not in st.session_state:
117
-         st.session_state.chat_history = []
118
-     if "chat_interface_active" not in st.session_state:
119
-         st.session_state.chat_interface_active = False
120
-
121
-     # Multi-url input
122
-     url_input = st.sidebar.text_area(
123
-         "Enter URLs (one per line)",
124
-         height=150,
125
-         help="Enter multiple URLs separated by new lines"
126
-     )
127
-
128
-
129
-     if st.sidebar.button("Get grants"):
130
-         if url_input:
131
-             urls = [url.strip() for url in url_input.split('\n') if url.strip()]
132
-             if urls:
133
-                 try:
134
-                     with st.spinner("Starting scraping process..."):
135
-                         result = process_multiple_urls(urls)
136
-                         st.session_state.scraped_data = result
137
-                         st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
138
-                 except Exception as e:
139
-                     st.error(f"Error in scraping process: {e}")
140
-             else:
141
-                 st.warning("Please enter valid URLs.")
142
-         else:
143
-             st.warning("Please enter at least one URL.")
144
-
145
-     if st.session_state.scraped_data:
146
-         selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
147
-         result = st.session_state.scraped_data
148
-
149
-         if selected_format == "CSV":
150
-             file_data = convert_to_csv(result)
151
-             file_name = "grants.csv"
152
-             file_type = "text/csv"
153
-         else:
154
-             file_data = convert_to_excel(result)
155
-             file_name = "grants.xlsx"
156
-             file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
157
-
158
-         b64 = base64.b64encode(file_data).decode()
159
-         download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
160
-         st.sidebar.markdown(download_link, unsafe_allow_html=True)
161
-
162
-         shareable_link = get_shareable_link(file_data, file_name, file_type)
163
-         st.sidebar.markdown("---")
164
-         st.sidebar.markdown("**Share Options:**")
165
-         
166
-         whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
167
-         st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
168
-
169
-         email_subject = urllib.parse.quote("Check out this grants file")
170
-         email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
171
-         email_url = f"mailto:?subject={email_subject}&body={email_body}"
172
-         st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
173
-
174
-         with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
175
-             st.dataframe(result['grants'])
176
-
177
-         if st.sidebar.button("Load data as Knowledge Base"):
178
-             st.session_state.qa_chain = create_knowledge_base(result)
179
-             st.session_state.chat_interface_active = True
180
-             st.session_state.chat_history = [] # Initialize chat_history here when KB is loaded
181
-
182
-     if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
183
-         st.header("Chat Interface Loaded. Start asking questions about the grants!")
184
-         st.image("logoqb.jpeg", width=150)
185
-         query = st.text_input("Ask a question about the grants:", key="chat_input")
186
-         
187
-         if query:
188
-             if st.session_state.qa_chain:
189
-                 response = st.session_state.qa_chain({"question": query})
190
-                 st.session_state.chat_history.append({"query": query, "response": response['answer']})
191
-             else:
192
-                 st.error("Knowledge base not loaded. Please load the knowledge base first.")
193
-
194
-         if "chat_history" in st.session_state: # Check if chat_history exists before iterating
195
-             for chat in st.session_state.chat_history:
196
-                 st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>", unsafe_allow_html=True)
197
-                 st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>", unsafe_allow_html=True)
 
 
 
 
 
198
 
199
  if __name__ == "__main__":
200
-     main()
 
18
# One-time environment setup for the scraper app.
# Download Playwright's browser binaries so the scraping graph can drive a headless browser.
subprocess.run(["playwright", "install"])
# Permit nested asyncio event loops (Streamlit and the async scraping stack share one loop).
nest_asyncio.apply()

# Google Generative AI credential; raises KeyError at startup if the env var is unset.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

# LLM configuration consumed by SmartScraperGraph in get_data().
graph_config = {
    "llm": {
        "api_key": GOOGLE_API_KEY,
        "model": "google_genai/gemini-pro",
    },
}
29
 
30
+
31
def get_data(url):
    """Scrape a single URL for grant/fund listings via SmartScraperGraph.

    Returns whatever the graph's run() produces — presumably a dict with a
    "grants" key (see process_multiple_urls) — TODO confirm against the
    scrapegraphai output schema.
    """
    scraper_prompt = (
        "List me all grants or funds, short summary of grant description, "
        "the organisations funding them, the value of the grant as an integer, "
        "the due date, eligible countries, sector and eligibility criteria for applicants."
    )
    graph = SmartScraperGraph(prompt=scraper_prompt, source=url, config=graph_config)
    return graph.run()
42
+
43
 
44
def process_multiple_urls(urls):
    """Scrape every URL in *urls*, accumulating grants with live progress UI.

    Returns {"grants": [...]}. A URL that raises is reported via st.error and
    skipped; the run continues with the remaining URLs.
    """
    collected = {"grants": []}
    progress_bar = st.progress(0)
    status_container = st.empty()

    count = len(urls)
    for position, url in enumerate(urls):
        try:
            url = url.strip()
            if not url:
                # Blank entry: skip without touching the progress widgets.
                continue

            # Progress reflects the URL currently being processed.
            progress_bar.progress((position + 1) / count)

            # Live status panel: current URL plus done/remaining counters.
            status_container.markdown(
                f"""
**Processing URL {position+1} of {count}**
🔍 Scanning: `{url}`
Completed: {position}/{count}
⏳ Remaining: {count - position - 1}
"""
            )

            scraped = get_data(url)
            if scraped and "grants" in scraped:
                collected["grants"].extend(scraped["grants"])
        except Exception as exc:  # report and keep going with the next URL
            st.error(f"Error processing {url}: {str(exc)}")
            continue

    progress_bar.empty()
    status_container.empty()
    return collected
84
+
85
 
86
def convert_to_csv(data):
    """Serialize the scraped grants (data["grants"]) to UTF-8 CSV bytes, no index column."""
    frame = pd.DataFrame(data["grants"])
    csv_text = frame.to_csv(index=False)
    return csv_text.encode("utf-8")
89
+
90
 
91
def convert_to_excel(data):
    """Serialize the scraped grants to an in-memory .xlsx workbook and return its bytes."""
    frame = pd.DataFrame(data["grants"])
    buffer = io.BytesIO()
    # The context manager finalizes the workbook before we read the buffer.
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        frame.to_excel(writer, sheet_name="Grants", index=False)
    return buffer.getvalue()
97
+
98
 
99
def create_knowledge_base(data):
    """Build a conversational retrieval chain over the scraped grants.

    Each grant becomes one text document ("Key: value" per line), which is
    split into chunks, embedded with Google embeddings, indexed in FAISS, and
    wrapped in a ConversationalRetrievalChain with buffer memory.
    """
    # One document per grant, rendering snake_case keys as Title Case labels.
    documents = [
        "\n".join(
            f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()
        )
        for grant in data["grants"]
    ]

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.create_documents(documents)

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    index = FAISS.from_documents(chunks, embeddings)

    chat_model = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-thinking-exp",
        google_api_key=GOOGLE_API_KEY,
        temperature=0,
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(chat_model, index.as_retriever(), memory=memory)
116
 
 
 
 
117
 
118
def get_shareable_link(file_data, file_name, file_type):
    """Return a data: URI embedding *file_data* (bytes) under MIME type *file_type*.

    *file_name* is accepted for interface compatibility but is not encoded
    into the URI itself.
    """
    encoded = base64.b64encode(file_data).decode()
    return "data:" + file_type + ";base64," + encoded
121
+
122
 
123
def _render_export_section(result):
    """Sidebar download/share UI for already-scraped data, plus a preview expander."""
    selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))

    if selected_format == "CSV":
        file_data = convert_to_csv(result)
        file_name = "grants.csv"
        file_type = "text/csv"
    else:
        file_data = convert_to_excel(result)
        file_name = "grants.xlsx"
        file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    # Offer the file as an inline data: URI download link.
    b64 = base64.b64encode(file_data).decode()
    download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
    st.sidebar.markdown(download_link, unsafe_allow_html=True)

    shareable_link = get_shareable_link(file_data, file_name, file_type)
    st.sidebar.markdown("---")
    st.sidebar.markdown("**Share Options:**")

    # NOTE(review): data: URIs grow with the dataset; WhatsApp/mailto links may
    # exceed practical URL length limits for large exports — confirm with users.
    whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
    st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")

    email_subject = urllib.parse.quote("Check out this grants file")
    email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
    email_url = f"mailto:?subject={email_subject}&body={email_body}"
    st.sidebar.markdown(f"📧 [Share via Email]({email_url})")

    with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
        st.dataframe(result["grants"])

    if st.sidebar.button("Load data as Knowledge Base"):
        st.session_state.qa_chain = create_knowledge_base(result)
        st.session_state.chat_interface_active = True
        st.session_state.chat_history = []  # start a fresh transcript for the new KB


def _render_chat_section():
    """Chat UI over the loaded knowledge base; renders the running transcript."""
    st.header("Chat Interface Loaded. Start asking questions about the grants!")
    st.image("logoqb.jpeg", width=150)
    query = st.text_input("Ask a question about the grants:", key="chat_input")

    if query:
        # Fix: read via .get() so a missing qa_chain degrades to the error
        # message instead of raising AttributeError on session_state access.
        qa_chain = st.session_state.get("qa_chain")
        if qa_chain:
            response = qa_chain({"question": query})
            st.session_state.chat_history.append({"query": query, "response": response["answer"]})
        else:
            st.error("Knowledge base not loaded. Please load the knowledge base first.")

    for chat in st.session_state.get("chat_history", []):
        st.markdown(
            f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>",
            unsafe_allow_html=True,
        )
        st.markdown(
            f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>",
            unsafe_allow_html=True,
        )


def main():
    """Streamlit entry point: collect URLs, scrape grants, export/share, and chat."""
    st.sidebar.title("Quantilytix Grant Scraper")
    st.sidebar.image("logoqb.jpeg", use_container_width=True)

    # Initialize session state so every rerun can rely on these keys existing.
    if "scraped_data" not in st.session_state:
        st.session_state.scraped_data = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "chat_interface_active" not in st.session_state:
        st.session_state.chat_interface_active = False
    if "qa_chain" not in st.session_state:
        # Robustness: guarantee the key exists before the chat section reads it.
        st.session_state.qa_chain = None

    # Multi-URL input
    url_input = st.sidebar.text_area(
        "Enter URLs (one per line)",
        height=150,
        help="Enter multiple URLs separated by new lines",
    )

    if st.sidebar.button("Get grants"):
        if url_input:
            urls = [url.strip() for url in url_input.split("\n") if url.strip()]
            if urls:
                try:
                    with st.spinner("Starting scraping process..."):
                        result = process_multiple_urls(urls)
                        st.session_state.scraped_data = result
                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
                except Exception as e:
                    st.error(f"Error in scraping process: {e}")
            else:
                st.warning("Please enter valid URLs.")
        else:
            st.warning("Please enter at least one URL.")

    if st.session_state.scraped_data:
        _render_export_section(st.session_state.scraped_data)

    if st.session_state.get("chat_interface_active"):
        _render_chat_section()
216
+
217
 
218
# Run the Streamlit app when executed as a script (not when imported).
if __name__ == "__main__":
    main()