Spaces:

Makima57
/

query-app

Sleeping

App Files Community

Makima57 committed on Sep 25, 2024

Commit

df2d3aa

verified ·

1 Parent(s): 330cdb8

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -26

app.py CHANGED Viewed

@@ -2,13 +2,19 @@ import streamlit as st
 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
-import chunk  # Import the chunking function from chunk.py
 # Function to perform Google search and return the first two links
 def google_search(query):
     try:
-        search_results = search(query, num_results=2)  # Get first two results
-        first_two_links = [next(search_results, None), next(search_results, None)]
         return first_two_links
     except Exception as e:
         st.error(f"An error occurred: {e}")
@@ -28,11 +34,15 @@ def fetch_webpage_content(url):
 def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
         for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
         text = '\n'.join(chunk for chunk in chunks if chunk)
         return text
     except Exception as e:
@@ -40,7 +50,7 @@ def scrape_text(webpage_content):
         return None
 # Streamlit app UI
-st.title("Search and Chunk Webpage Content")
 # Input field for search query
 query = st.text_input("Enter search query", "")
@@ -50,8 +60,8 @@ if st.button("Search"):
     if query:
         first_two_links = google_search(query)
         if first_two_links:
-            for i, link in enumerate(first_two_links, 1):
-                st.success(f"Link {i}: [Click here]({link})")
                 # Fetch webpage content
                 webpage_content = fetch_webpage_content(link)
@@ -59,28 +69,19 @@ if st.button("Search"):
                     # Scrape text from webpage content
                     scraped_text = scrape_text(webpage_content)
                     if scraped_text:
-                        # Chunk the scraped text using chunk.py
-                        chunked_text = chunk.chunk_text(scraped_text)
-                        chunk.display_chunks(chunked_text)
-                        # Save chunked data to a .txt file for later use
-                        file_name = f"chunked_data_link_{i}.txt"
-                        with open(file_name, "w") as f:
-                            f.write("\n---\n".join(chunked_text))  # Separate chunks by a line break and delimiter
-                        st.write(f"Chunked Data for Link {i}:")
-                        for chunk_part in chunked_text:
-                            st.write(chunk_part)
-                        # Provide a unique key for each download button
                         st.download_button(
-                            label=f"Download Chunked Webpage Content for Link {i}",
-                            data="\n---\n".join(chunked_text),
-                            file_name=file_name,
-                            mime="text/plain",
-                            key=f"download_button_{i}"  # Unique key for each button
                         )
         else:
             st.warning("No results found")
     else:
-        st.error("Please enter a query")

 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
+import chunk  # Import the chunking functionality from the local chunk.py module
 # Function to perform Google search and return the first two links
 def google_search(query):
     try:
+        query = query + "/t site:https://medium.com/"
+        search_results = search(query, num_results=10)  # Get up to 10 results
+        first_two_links = []
+        for i, link in enumerate(search_results):
+            if i < 2:
+                first_two_links.append(link)
+            else:
+                break
         return first_two_links
     except Exception as e:
         st.error(f"An error occurred: {e}")
 def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
+        # Remove all script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
+        # Break the text into lines and remove leading/trailing spaces
         lines = (line.strip() for line in text.splitlines())
+        # Break multi-headlines into a line each
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        # Drop blank lines
         text = '\n'.join(chunk for chunk in chunks if chunk)
         return text
     except Exception as e:
         return None
 # Streamlit app UI
+st.title("Search Link Finder")
 # Input field for search query
 query = st.text_input("Enter search query", "")
     if query:
         first_two_links = google_search(query)
         if first_two_links:
+            for i, link in enumerate(first_two_links):
+                st.success(f"Link {i+1}: [Click here]({link})")
                 # Fetch webpage content
                 webpage_content = fetch_webpage_content(link)
                     # Scrape text from webpage content
                     scraped_text = scrape_text(webpage_content)
                     if scraped_text:
+                        st.write(f"Scraped Content from Link {i+1} (Chunked):")
+                        # Call the chunk display function from the local chunk module
+                        chunk.display_chunks(scraped_text)
+                        # Option to download the entire scraped content
                         st.download_button(
+                            label=f"Download Full Webpage Content from Link {i+1}",
+                            data=scraped_text,
+                            file_name=f"webpage_content_{i+1}.txt",
+                            mime="text/plain"
                         )
         else:
             st.warning("No results found")
     else:
+        st.error("Please enter a query")