usmanyousaf committed on
Commit
5926a5d
·
verified ·
1 Parent(s): 6a10786

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -66
app.py CHANGED
@@ -1,85 +1,41 @@
1
  import streamlit as st
2
  from scrape import scrape_website, extract_body_content, clean_body_content, split_dom_content
3
- from parse import parse_with_groq
4
 
5
- # Streamlit UI with sidebar
6
- st.set_page_config(page_title="Web Scraping App 🧠", page_icon="🌐")
7
-
8
- st.sidebar.title("🚀 Model Selection")
9
- selected_model = st.sidebar.selectbox(
10
- "Choose a Model for Parsing:",
11
- [
12
- "llama3-8b-8192",
13
- "distil-whisper-large-v3-en",
14
- "llama3-groq-70b-8192-tool-use-preview",
15
- "llama-3.1-8b-instant",
16
- "llava-v1.5-7b-4096-preview",
17
- "mixtral-8x7b-32768",
18
- ]
19
- )
20
-
21
- # Application title
22
- st.title("AI Web Scraper App 🌐")
23
- st.write("Easily scrape and analyze web content using advanced AI models. 🌟")
24
-
25
- # Input for website URL
26
- url = st.text_input("Enter Website URL 🔗")
27
 
28
  # Step 1: Scrape the Website
29
  if st.button("Scrape Website"):
30
  if url:
31
- st.write("🕵️‍♂️ Scraping the website...")
32
 
33
- # Scrape the website
34
  dom_content = scrape_website(url)
35
- body_content = extract_body_content(dom_content)
36
- cleaned_content = clean_body_content(body_content)
 
37
 
38
- # Store the DOM content in Streamlit session state
39
- st.session_state.dom_content = cleaned_content
40
 
41
- # Display the DOM content in an expandable text box
42
- with st.expander("View DOM Content"):
43
- st.text_area("DOM Content", cleaned_content, height=300)
 
 
44
 
45
- # Step 2: Parse the Content
 
46
  if "dom_content" in st.session_state:
47
- parse_description = st.text_area("Describe what you want to parse 📝")
48
 
49
  if st.button("Parse Content"):
50
  if parse_description:
51
- st.write(f"🤖 Parsing the content with {selected_model}...")
52
 
53
- # Parse content using Groq
54
  dom_chunks = split_dom_content(st.session_state.dom_content)
55
- parsed_result = parse_with_groq(dom_chunks, parse_description, model=selected_model)
56
  st.write(parsed_result)
57
-
58
- # CSS for footer at the bottom of the sidebar
59
- st.markdown(
60
- """
61
- <style>
62
- .footer {
63
- position: fixed;
64
- bottom: 0;
65
- left: 0;
66
- width: 100%;
67
- background-color: #272432; /* Dark background for visibility */
68
- color: white;
69
- text-align: center;
70
- padding: 10px;
71
- font-size: 14px;
72
- }
73
- .sidebar .footer {
74
- position: fixed;
75
- bottom: 0;
76
- }
77
- </style>
78
-
79
- <div class="footer">
80
- Made with ❤️ by Usman Yousaf 🚀<br>
81
- Feel free to improve and expand this app for more powerful insights! 🔥
82
- </div>
83
- """,
84
- unsafe_allow_html=True
85
- )
 
1
  import streamlit as st
2
  from scrape import scrape_website, extract_body_content, clean_body_content, split_dom_content
3
+ from parse import parse_with_ollama
4
 
5
+ # Streamlit UI
6
+ st.title("AI Web Scraper")
7
+ url = st.text_input("Enter Website URL")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Step 1: Scrape the Website
10
  if st.button("Scrape Website"):
11
  if url:
12
+ st.write("Scraping the website...")
13
 
14
+ # Scrape the website using requests and BeautifulSoup
15
  dom_content = scrape_website(url)
16
+ if dom_content:
17
+ body_content = extract_body_content(dom_content)
18
+ cleaned_content = clean_body_content(body_content)
19
 
20
+ # Store the cleaned DOM content in Streamlit session state
21
+ st.session_state.dom_content = cleaned_content
22
 
23
+ # Display the cleaned DOM content in an expandable text box
24
+ with st.expander("View Cleaned DOM Content"):
25
+ st.text_area("DOM Content", cleaned_content, height=300)
26
+ else:
27
+ st.error("Failed to scrape the website. Please check the URL.")
28
 
29
+
30
+ # Step 2: Ask Questions About the DOM Content
31
  if "dom_content" in st.session_state:
32
+ parse_description = st.text_area("Describe what you want to parse")
33
 
34
  if st.button("Parse Content"):
35
  if parse_description:
36
+ st.write("Parsing the content...")
37
 
38
+ # Parse the content with Ollama
39
  dom_chunks = split_dom_content(st.session_state.dom_content)
40
+ parsed_result = parse_with_ollama(dom_chunks, parse_description)
41
  st.write(parsed_result)