rairo committed on
Commit
a1b4ab9
·
verified ·
1 Parent(s): 21f4f5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -149
app.py CHANGED
@@ -21,172 +21,180 @@ nest_asyncio.apply()
21
  GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
22
 
23
  graph_config = {
24
- "llm": {
25
- "api_key": GOOGLE_API_KEY,
26
- "model": "google_genai/gemini-pro",
27
- },
28
  }
29
 
30
  def get_data(url):
31
- smart_scraper_graph = SmartScraperGraph(
32
- prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
33
- source=url,
34
- config=graph_config
35
- )
36
- return smart_scraper_graph.run()
37
 
38
  def process_multiple_urls(urls):
39
- """
40
- Process multiple URLs with progress tracking
41
- """
42
- all_data = {"grants": []}
43
- progress_bar = st.progress(0)
44
- status_container = st.empty()
45
-
46
- total_urls = len(urls)
47
- for index, url in enumerate(urls):
48
- try:
49
- url = url.strip()
50
- if not url:
51
- continue
52
-
53
- # Update progress
54
- progress = (index + 1) / total_urls
55
- progress_bar.progress(progress)
56
-
57
- # Show current status
58
- status_container.markdown(f"""
59
- **Processing URL {index+1} of {total_urls}**
60
- 🔍 Scanning: `{url}`
61
- ✅ Completed: {index}/{total_urls}
62
- ⏳ Remaining: {total_urls - index - 1}
63
- """)
64
-
65
- # Scrape data
66
- result = get_data(url)
67
- if result and 'grants' in result:
68
- all_data['grants'].extend(result['grants'])
69
-
70
- except Exception as e:
71
- st.error(f"Error processing {url}: {str(e)}")
72
- continue
73
-
74
- progress_bar.empty()
75
- status_container.empty()
76
- return all_data
77
 
78
  def convert_to_csv(data):
79
- df = pd.DataFrame(data['grants'])
80
- return df.to_csv(index=False).encode('utf-8')
81
 
82
  def convert_to_excel(data):
83
- df = pd.DataFrame(data['grants'])
84
- buffer = io.BytesIO()
85
- with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
86
- df.to_excel(writer, sheet_name='Grants', index=False)
87
- return buffer.getvalue()
88
 
89
  def create_knowledge_base(data):
90
- documents = []
91
- for grant in data['grants']:
92
- doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
93
- documents.append("\n".join(doc_parts))
94
 
95
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
96
- texts = text_splitter.create_documents(documents)
97
 
98
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
99
- vectorstore = FAISS.from_documents(texts, embeddings)
100
 
101
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
102
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
103
- return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
104
 
105
  def get_shareable_link(file_data, file_name, file_type):
106
- b64 = base64.b64encode(file_data).decode()
107
- return f"data:{file_type};base64,{b64}"
108
 
109
  def main():
110
- st.sidebar.title("Quantilytix Grant Scraper")
111
- st.sidebar.image("logoqb.jpeg", use_container_width=True)
112
-
113
- # Multi-url input
114
- url_input = st.sidebar.text_area(
115
- "Enter URLs (one per line)",
116
- height=150,
117
- help="Enter multiple URLs separated by new lines"
118
- )
119
-
120
- if "scraped_data" not in st.session_state:
121
- st.session_state.scraped_data = None
122
-
123
- if st.sidebar.button("Get grants"):
124
- if url_input:
125
- urls = [url.strip() for url in url_input.split('\n') if url.strip()]
126
- if urls:
127
- try:
128
- with st.spinner("Starting scraping process..."):
129
- result = process_multiple_urls(urls)
130
- st.session_state.scraped_data = result
131
- st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
132
- except Exception as e:
133
- st.error(f"Error in scraping process: {e}")
134
- else:
135
- st.warning("Please enter valid URLs.")
136
- else:
137
- st.warning("Please enter at least one URL.")
138
-
139
- if st.session_state.scraped_data:
140
- selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
141
- result = st.session_state.scraped_data
142
-
143
- if selected_format == "CSV":
144
- file_data = convert_to_csv(result)
145
- file_name = "grants.csv"
146
- file_type = "text/csv"
147
- else:
148
- file_data = convert_to_excel(result)
149
- file_name = "grants.xlsx"
150
- file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
151
-
152
- b64 = base64.b64encode(file_data).decode()
153
- download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
154
- st.sidebar.markdown(download_link, unsafe_allow_html=True)
155
-
156
- shareable_link = get_shareable_link(file_data, file_name, file_type)
157
- st.sidebar.markdown("---")
158
- st.sidebar.markdown("**Share Options:**")
159
-
160
- whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
161
- st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
162
-
163
- email_subject = urllib.parse.quote("Check out this grants file")
164
- email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
165
- email_url = f"mailto:?subject={email_subject}&body={email_body}"
166
- st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
167
-
168
- with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
169
- st.dataframe(result['grants'])
170
-
171
- if st.sidebar.button("Load data as Knowledge Base"):
172
- st.session_state.qa_chain = create_knowledge_base(result)
173
- st.session_state.chat_interface_active = True
174
-
175
- if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
176
- st.header("Chat Interface Loaded. Start asking questions about the grants!")
177
- st.image("logoqb.jpeg", width=150)
178
- query = st.text_input("Ask a question about the grants:", key="chat_input")
179
-
180
- if query:
181
- if st.session_state.qa_chain:
182
- response = st.session_state.qa_chain({"question": query})
183
- st.session_state.chat_history.append({"query": query, "response": response['answer']})
184
- else:
185
- st.error("Knowledge base not loaded. Please load the knowledge base first.")
186
-
187
- for chat in st.session_state.chat_history:
188
- st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>", unsafe_allow_html=True)
189
- st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
190
 
191
  if __name__ == "__main__":
192
- main()
 
21
  GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
22
 
23
  graph_config = {
24
+     "llm": {
25
+         "api_key": GOOGLE_API_KEY,
26
+         "model": "google_genai/gemini-pro",
27
+     },
28
  }
29
 
30
  def get_data(url):
31
+     smart_scraper_graph = SmartScraperGraph(
32
+         prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
33
+         source=url,
34
+         config=graph_config
35
+     )
36
+     return smart_scraper_graph.run()
37
 
38
  def process_multiple_urls(urls):
39
+     """
40
+     Process multiple URLs with progress tracking
41
+     """
42
+     all_data = {"grants": []}
43
+     progress_bar = st.progress(0)
44
+     status_container = st.empty()
45
+     
46
+     total_urls = len(urls)
47
+     for index, url in enumerate(urls):
48
+         try:
49
+             url = url.strip()
50
+             if not url:
51
+                 continue
52
+                 
53
+             # Update progress
54
+             progress = (index + 1) / total_urls
55
+             progress_bar.progress(progress)
56
+             
57
+             # Show current status
58
+             status_container.markdown(f"""
59
+             **Processing URL {index+1} of {total_urls}**  
60
+             🔍 Scanning: `{url}`  
61
+             ✅ Completed: {index}/{total_urls}  
62
+             ⏳ Remaining: {total_urls - index - 1}
63
+             """)
64
+             
65
+             # Scrape data
66
+             result = get_data(url)
67
+             if result and 'grants' in result:
68
+                 all_data['grants'].extend(result['grants'])
69
+                 
70
+         except Exception as e:
71
+             st.error(f"Error processing {url}: {str(e)}")
72
+             continue
73
+             
74
+     progress_bar.empty()
75
+     status_container.empty()
76
+     return all_data
77
 
78
  def convert_to_csv(data):
79
+     df = pd.DataFrame(data['grants'])
80
+     return df.to_csv(index=False).encode('utf-8')
81
 
82
  def convert_to_excel(data):
83
+     df = pd.DataFrame(data['grants'])
84
+     buffer = io.BytesIO()
85
+     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
86
+         df.to_excel(writer, sheet_name='Grants', index=False)
87
+     return buffer.getvalue()
88
 
89
  def create_knowledge_base(data):
90
+     documents = []
91
+     for grant in data['grants']:
92
+         doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
93
+         documents.append("\n".join(doc_parts))
94
 
95
+     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
96
+     texts = text_splitter.create_documents(documents)
97
 
98
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
99
+     vectorstore = FAISS.from_documents(texts, embeddings)
100
 
101
+     llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
102
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
103
+     return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
104
 
105
  def get_shareable_link(file_data, file_name, file_type):
106
+     b64 = base64.b64encode(file_data).decode()
107
+     return f"data:{file_type};base64,{b64}"
108
 
109
  def main():
110
+     st.sidebar.title("Quantilytix Grant Scraper")
111
+     st.sidebar.image("logoqb.jpeg", use_container_width=True)
112
+
113
+     # Initialize session state for scraped data and chat history if not already present
114
+     if "scraped_data" not in st.session_state:
115
+         st.session_state.scraped_data = None
116
+     if "chat_history" not in st.session_state:
117
+         st.session_state.chat_history = []
118
+     if "chat_interface_active" not in st.session_state:
119
+         st.session_state.chat_interface_active = False
120
+
121
+     # Multi-url input
122
+     url_input = st.sidebar.text_area(
123
+         "Enter URLs (one per line)",
124
+         height=150,
125
+         help="Enter multiple URLs separated by new lines"
126
+     )
127
+
128
+
129
+     if st.sidebar.button("Get grants"):
130
+         if url_input:
131
+             urls = [url.strip() for url in url_input.split('\n') if url.strip()]
132
+             if urls:
133
+                 try:
134
+                     with st.spinner("Starting scraping process..."):
135
+                         result = process_multiple_urls(urls)
136
+                         st.session_state.scraped_data = result
137
+                         st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
138
+                 except Exception as e:
139
+                     st.error(f"Error in scraping process: {e}")
140
+             else:
141
+                 st.warning("Please enter valid URLs.")
142
+         else:
143
+             st.warning("Please enter at least one URL.")
144
+
145
+     if st.session_state.scraped_data:
146
+         selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
147
+         result = st.session_state.scraped_data
148
+
149
+         if selected_format == "CSV":
150
+             file_data = convert_to_csv(result)
151
+             file_name = "grants.csv"
152
+             file_type = "text/csv"
153
+         else:
154
+             file_data = convert_to_excel(result)
155
+             file_name = "grants.xlsx"
156
+             file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
157
+
158
+         b64 = base64.b64encode(file_data).decode()
159
+         download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
160
+         st.sidebar.markdown(download_link, unsafe_allow_html=True)
161
+
162
+         shareable_link = get_shareable_link(file_data, file_name, file_type)
163
+         st.sidebar.markdown("---")
164
+         st.sidebar.markdown("**Share Options:**")
165
+         
166
+         whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
167
+         st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
168
+
169
+         email_subject = urllib.parse.quote("Check out this grants file")
170
+         email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
171
+         email_url = f"mailto:?subject={email_subject}&body={email_body}"
172
+         st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
173
+
174
+         with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
175
+             st.dataframe(result['grants'])
176
+
177
+         if st.sidebar.button("Load data as Knowledge Base"):
178
+             st.session_state.qa_chain = create_knowledge_base(result)
179
+             st.session_state.chat_interface_active = True
180
+             st.session_state.chat_history = [] # Initialize chat_history here when KB is loaded
181
+
182
+     if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
183
+         st.header("Chat Interface Loaded. Start asking questions about the grants!")
184
+         st.image("logoqb.jpeg", width=150)
185
+         query = st.text_input("Ask a question about the grants:", key="chat_input")
186
+         
187
+         if query:
188
+             if st.session_state.qa_chain:
189
+                 response = st.session_state.qa_chain({"question": query})
190
+                 st.session_state.chat_history.append({"query": query, "response": response['answer']})
191
+             else:
192
+                 st.error("Knowledge base not loaded. Please load the knowledge base first.")
193
+
194
+         if "chat_history" in st.session_state: # Check if chat_history exists before iterating
195
+             for chat in st.session_state.chat_history:
196
+                 st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>", unsafe_allow_html=True)
197
+                 st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>", unsafe_allow_html=True)
198
 
199
  if __name__ == "__main__":
200
+     main()