Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,172 +21,180 @@ nest_asyncio.apply()
|
|
| 21 |
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
|
| 22 |
|
| 23 |
graph_config = {
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
}
|
| 29 |
|
| 30 |
def get_data(url):
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
|
| 38 |
def process_multiple_urls(urls):
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
|
| 78 |
def convert_to_csv(data):
|
| 79 |
-
|
| 80 |
-
|
| 81 |
|
| 82 |
def convert_to_excel(data):
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
def create_knowledge_base(data):
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
|
| 105 |
def get_shareable_link(file_data, file_name, file_type):
|
| 106 |
-
|
| 107 |
-
|
| 108 |
|
| 109 |
def main():
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
if __name__ == "__main__":
|
| 192 |
-
|
|
|
|
| 21 |
# API key for the Google Gemini services used below. Indexing os.environ
# directly means a missing variable raises KeyError as soon as the module loads.
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']

# Configuration handed to SmartScraperGraph in get_data(): selects the LLM
# and supplies its credentials.
graph_config = {
    "llm": {
        "api_key": GOOGLE_API_KEY,
        "model": "google_genai/gemini-pro",
    },
}
|
| 29 |
|
| 30 |
def get_data(url):
    """Scrape grant information from a single page.

    Builds a ``SmartScraperGraph`` for ``url`` using the module-level
    ``graph_config`` and a fixed extraction prompt, then runs it.

    Args:
        url: Address of the page to scrape; passed through as ``source``.

    Returns:
        Whatever ``SmartScraperGraph.run()`` returns. Callers in this file
        expect a mapping with a ``'grants'`` key (see
        ``process_multiple_urls``) -- the exact shape depends on the LLM
        output and is not validated here.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
        source=url,
        config=graph_config,
    )
    return smart_scraper_graph.run()
|
| 37 |
|
| 38 |
def process_multiple_urls(urls):
    """
    Process multiple URLs with progress tracking.

    Each URL is stripped, skipped if blank, and scraped with ``get_data``.
    A Streamlit progress bar and a status placeholder are updated before
    each scrape and cleared once the loop finishes.

    Args:
        urls: Sequence of URL strings (``len()`` is taken for progress).

    Returns:
        ``{"grants": [...]}`` containing the ``'grants'`` entries of every
        successful scrape, concatenated in input order.

    A failure on one URL is reported with ``st.error`` and does not stop
    the remaining URLs from being processed.
    """
    all_data = {"grants": []}
    progress_bar = st.progress(0)
    status_container = st.empty()

    total_urls = len(urls)
    for index, url in enumerate(urls):
        try:
            url = url.strip()
            if not url:
                continue

            # Update progress
            progress = (index + 1) / total_urls
            progress_bar.progress(progress)

            # Show current status
            status_container.markdown(
                "\n"
                f"**Processing URL {index+1} of {total_urls}**\n"
                f"🔍 Scanning: `{url}`\n"
                f"✅ Completed: {index}/{total_urls}\n"
                f"⏳ Remaining: {total_urls - index - 1}\n"
            )

            # Scrape data
            result = get_data(url)
            if result and 'grants' in result:
                all_data['grants'].extend(result['grants'])

        except Exception as e:
            # Best-effort batch: surface the failure in the UI and move on.
            st.error(f"Error processing {url}: {str(e)}")
            continue

    progress_bar.empty()
    status_container.empty()
    return all_data
|
| 77 |
|
| 78 |
def convert_to_csv(data):
    """Serialise the scraped grants to CSV.

    Args:
        data: Mapping whose ``'grants'`` entry is accepted by
            ``pd.DataFrame`` (in this file, a list of per-grant dicts).

    Returns:
        The CSV text, without an index column, encoded as UTF-8 bytes.
    """
    df = pd.DataFrame(data['grants'])
    return df.to_csv(index=False).encode('utf-8')
|
| 81 |
|
| 82 |
def convert_to_excel(data):
    """Serialise the scraped grants to an in-memory Excel workbook.

    Args:
        data: Mapping whose ``'grants'`` entry is accepted by
            ``pd.DataFrame``.

    Returns:
        The bytes of an ``.xlsx`` workbook with a single sheet named
        ``'Grants'`` and no index column.
    """
    df = pd.DataFrame(data['grants'])
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Grants', index=False)
    # Read the buffer only after the writer context has closed, so the
    # workbook has been fully written out.
    return buffer.getvalue()
|
| 88 |
|
| 89 |
def create_knowledge_base(data):
    """Build a conversational retrieval chain over the scraped grants.

    Each grant is flattened into a text document of ``"Key: value"`` lines
    (underscores in keys replaced by spaces and title-cased), split into
    chunks, embedded, and indexed in a FAISS vector store. The store's
    retriever is combined with a Gemini chat model and a conversation
    buffer memory.

    Args:
        data: Mapping whose ``'grants'`` entry is an iterable of dicts with
            string keys.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    documents = []
    for grant in data['grants']:
        doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
        documents.append("\n".join(doc_parts))

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.create_documents(documents)

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    vectorstore = FAISS.from_documents(texts, embeddings)

    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
|
| 104 |
|
| 105 |
def get_shareable_link(file_data, file_name, file_type):
    """Return a ``data:`` URI embedding ``file_data`` as base64.

    Args:
        file_data: Raw file contents as bytes.
        file_name: Accepted for interface compatibility; not used in the URI.
        file_type: MIME type placed in the URI header.

    Returns:
        A string of the form ``data:<file_type>;base64,<payload>``.
    """
    b64 = base64.b64encode(file_data).decode()
    return f"data:{file_type};base64,{b64}"
|
| 108 |
|
| 109 |
def main():
    """Render the Streamlit grant-scraper UI.

    The sidebar collects URLs, triggers scraping, and offers the result as
    a CSV/Excel download plus WhatsApp and e-mail share links. Scraped data
    can then be loaded into a retrieval chain, which enables a simple
    question/answer chat in the main pane. Scraped data, the chain, and the
    chat history are kept in ``st.session_state`` across reruns.
    """
    # Local import: only needed to escape chat text that is rendered as HTML.
    import html

    st.sidebar.title("Quantilytix Grant Scraper")
    st.sidebar.image("logoqb.jpeg", use_container_width=True)

    # Initialize session state for scraped data and chat history if not already present
    if "scraped_data" not in st.session_state:
        st.session_state.scraped_data = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "chat_interface_active" not in st.session_state:
        st.session_state.chat_interface_active = False
    # Initialised like the other keys so the "knowledge base not loaded"
    # check below cannot fail on a missing attribute.
    if "qa_chain" not in st.session_state:
        st.session_state.qa_chain = None

    # Multi-url input
    url_input = st.sidebar.text_area(
        "Enter URLs (one per line)",
        height=150,
        help="Enter multiple URLs separated by new lines"
    )

    if st.sidebar.button("Get grants"):
        if url_input:
            urls = [url.strip() for url in url_input.split('\n') if url.strip()]
            if urls:
                try:
                    with st.spinner("Starting scraping process..."):
                        result = process_multiple_urls(urls)
                        st.session_state.scraped_data = result
                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
                except Exception as e:
                    st.error(f"Error in scraping process: {e}")
            else:
                st.warning("Please enter valid URLs.")
        else:
            st.warning("Please enter at least one URL.")

    if st.session_state.scraped_data:
        selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
        result = st.session_state.scraped_data

        if selected_format == "CSV":
            file_data = convert_to_csv(result)
            file_name = "grants.csv"
            file_type = "text/csv"
        else:
            file_data = convert_to_excel(result)
            file_name = "grants.xlsx"
            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

        # One data: URI serves both the download anchor and the share links,
        # so the payload is base64-encoded only once.
        shareable_link = get_shareable_link(file_data, file_name, file_type)
        download_link = f"<a href='{shareable_link}' download='{file_name}'>Download {selected_format}</a>"
        st.sidebar.markdown(download_link, unsafe_allow_html=True)

        st.sidebar.markdown("---")
        st.sidebar.markdown("**Share Options:**")

        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
        st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")

        email_subject = urllib.parse.quote("Check out this grants file")
        email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
        email_url = f"mailto:?subject={email_subject}&body={email_body}"
        st.sidebar.markdown(f"📧 [Share via Email]({email_url})")

        with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
            st.dataframe(result['grants'])

        if st.sidebar.button("Load data as Knowledge Base"):
            st.session_state.qa_chain = create_knowledge_base(result)
            st.session_state.chat_interface_active = True
            st.session_state.chat_history = []  # Initialize chat_history here when KB is loaded

    if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
        st.header("Chat Interface Loaded. Start asking questions about the grants!")
        st.image("logoqb.jpeg", width=150)
        query = st.text_input("Ask a question about the grants:", key="chat_input")

        # NOTE(review): the text input keeps its value across reruns, so any
        # rerun while a question is still in the box asks it again and adds
        # another history entry -- confirm whether that is intended.
        if query:
            if st.session_state.qa_chain:
                response = st.session_state.qa_chain({"question": query})
                st.session_state.chat_history.append({"query": query, "response": response['answer']})
            else:
                st.error("Knowledge base not loaded. Please load the knowledge base first.")

        if "chat_history" in st.session_state:  # Check if chat_history exists before iterating
            for chat in st.session_state.chat_history:
                # These strings are injected into raw HTML, so escape user and
                # model text to keep stray tags from being interpreted.
                safe_query = html.escape(str(chat['query']), quote=False)
                safe_response = html.escape(str(chat['response']), quote=False)
                st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {safe_query}</p>", unsafe_allow_html=True)
                st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {safe_response}</p>", unsafe_allow_html=True)
|
| 198 |
|
| 199 |
# Script entry point: start the app only when run directly, not on import.
if __name__ == "__main__":
    main()
|