usmanyousaf committed
Commit 2d64614 · verified · 1 Parent(s): 277a05c

Upload 5 files

Files changed (6)
  1. .gitattributes +1 -0
  2. chromedriver +3 -0
  3. main.py +85 -0
  4. parse.py +35 -0
  5. requirements.txt +5 -0
  6. scrape.py +60 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ chromedriver filter=lfs diff=lfs merge=lfs -text
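
Note: this new .gitattributes entry keeps the ~19 MB chromedriver binary in Git LFS rather than in plain Git history. It is the line that the following command appends (assuming Git LFS is installed):

    git lfs track chromedriver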
chromedriver ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2ff8d0f3628d5684b8ce86ec32790ebf13fecca004a938270b119a93608ef09
+ size 19025744
main.py ADDED
@@ -0,0 +1,85 @@
+ import streamlit as st
+ from scrape import scrape_website, extract_body_content, clean_body_content, split_dom_content
+ from parse import parse_with_groq
+
+ # Streamlit UI with sidebar
+ st.set_page_config(page_title="Web Scraping App 🧠", page_icon="🌐")
+
+ st.sidebar.title("🚀 Model Selection")
+ selected_model = st.sidebar.selectbox(
+     "Choose a Model for Parsing:",
+     [
+         "llama3-8b-8192",
+         "distil-whisper-large-v3-en",
+         "llama3-groq-70b-8192-tool-use-preview",
+         "llama-3.1-8b-instant",
+         "llava-v1.5-7b-4096-preview",
+         "mixtral-8x7b-32768",
+     ]
+ )
+
+ # Application title
+ st.title("AI Web Scraper App 🌐")
+ st.write("Easily scrape and analyze web content using advanced AI models. 🌟")
+
+ # Input for website URL
+ url = st.text_input("Enter Website URL 🔗")
+
+ # Step 1: Scrape the website
+ if st.button("Scrape Website"):
+     if url:
+         st.write("🕵️‍♂️ Scraping the website...")
+
+         # Scrape the website and clean the result
+         dom_content = scrape_website(url)
+         body_content = extract_body_content(dom_content)
+         cleaned_content = clean_body_content(body_content)
+
+         # Store the cleaned DOM content in Streamlit session state
+         st.session_state.dom_content = cleaned_content
+
+         # Display the DOM content in an expandable text box
+         with st.expander("View DOM Content"):
+             st.text_area("DOM Content", cleaned_content, height=300)
+
+ # Step 2: Parse the content
+ if "dom_content" in st.session_state:
+     parse_description = st.text_area("Describe what you want to parse 📝")
+
+     if st.button("Parse Content"):
+         if parse_description:
+             st.write(f"🤖 Parsing the content with {selected_model}...")
+
+             # Parse the stored content using Groq
+             dom_chunks = split_dom_content(st.session_state.dom_content)
+             parsed_result = parse_with_groq(dom_chunks, parse_description, model=selected_model)
+             st.write(parsed_result)
+
+ # CSS for footer at the bottom of the sidebar
+ st.markdown(
+     """
+     <style>
+     .footer {
+         position: fixed;
+         bottom: 0;
+         left: 0;
+         width: 100%;
+         background-color: #272432; /* Dark background for visibility */
+         color: white;
+         text-align: center;
+         padding: 10px;
+         font-size: 14px;
+     }
+     .sidebar .footer {
+         position: fixed;
+         bottom: 0;
+     }
+     </style>
+
+     <div class="footer">
+         Made with ❤️ by Usman Yousaf 🚀<br>
+         Feel free to improve and expand this app for more powerful insights! 🔥
+     </div>
+     """,
+     unsafe_allow_html=True
+ )
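
Note: storing the cleaned DOM in st.session_state lets it survive Streamlit's script rerun between the "Scrape Website" and "Parse Content" clicks, which is why step 2 can run on a later interaction. To launch the app locally (assuming the dependencies from requirements.txt below are installed):

    streamlit run main.py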
parse.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ from groq import Groq
+ # Initialize the Groq client with the API key from the environment (never hardcode secrets)
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+
+ # Define the template for parsing
+ template = (
+     "You are tasked with extracting specific information from the following text content: {dom_content}. "
+     "Please follow these instructions carefully: \n\n"
+     "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
+     "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
+     "3. **Empty Response:** If no information matches the description, return an empty string (''). "
+     "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
+ )
+
+ def parse_with_groq(dom_chunks, parse_description, model="llama3-8b-8192"):
+     parsed_results = []
+
+     for i, chunk in enumerate(dom_chunks, start=1):
+         # Prepare the prompt for this chunk
+         prompt = template.format(dom_content=chunk, parse_description=parse_description)
+
+         # Send the prompt to Groq for processing, specifying the selected model
+         response = client.chat.completions.create(
+             messages=[
+                 {"role": "user", "content": prompt}
+             ],
+             model=model  # Specify the model
+         )
+
+         # Print status and store the result
+         print(f"Parsed batch: {i} of {len(dom_chunks)}")
+         parsed_results.append(response.choices[0].message.content)  # Access the content
+
+     return "\n".join(parsed_results)
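
For reference, a minimal usage sketch of parse_with_groq; the chunks and description are placeholders, and GROQ_API_KEY must be exported in the environment before import:

    from parse import parse_with_groq

    chunks = ["Price: $42. In stock.", "Ships in 3-5 days."]  # placeholder text chunks
    result = parse_with_groq(chunks, "the product price", model="llama3-8b-8192")
    print(result)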
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ selenium
+ beautifulsoup4
+ groq
+ python-dotenv
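
To install these dependencies before running the app:

    pip install -r requirements.txt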
scrape.py ADDED
@@ -0,0 +1,60 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ from bs4 import BeautifulSoup
+ from dotenv import load_dotenv
+ import os
+ import time
+
+ load_dotenv()
+
+ CHROME_DRIVER_PATH = os.getenv("CHROME_DRIVER_PATH", "./chromedriver")
+
+ def scrape_website(website):
+     print("Connecting to Chrome Browser...")
+
+     # Set up the ChromeDriver service and options
+     service = Service(CHROME_DRIVER_PATH)
+     options = Options()
+     driver = webdriver.Chrome(service=service, options=options)
+
+     try:
+         driver.get(website)
+         print("Waiting for CAPTCHA to be solved manually (if present)...")
+
+         # Optional waiting loop for manual CAPTCHA solving
+         while "captcha" in driver.page_source.lower():
+             print("CAPTCHA detected, waiting...")
+             time.sleep(5)
+
+         print("CAPTCHA solved or not present. Scraping page content...")
+         html = driver.page_source
+         return html
+
+     finally:
+         driver.quit()
+
+ def extract_body_content(html_content):
+     soup = BeautifulSoup(html_content, "html.parser")
+     body_content = soup.body
+     if body_content:
+         return str(body_content)
+     return ""
+
+ def clean_body_content(body_content):
+     soup = BeautifulSoup(body_content, "html.parser")
+
+     # Drop script and style tags before extracting text
+     for script_or_style in soup(["script", "style"]):
+         script_or_style.extract()
+
+     cleaned_content = soup.get_text(separator="\n")
+     cleaned_content = "\n".join(
+         line.strip() for line in cleaned_content.splitlines() if line.strip()
+     )
+
+     return cleaned_content
+
+ def split_dom_content(dom_content, max_length=6000):
+     return [
+         dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
+     ]
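
For reference, a minimal end-to-end sketch of the scraping pipeline; the URL is a placeholder, and a local Chrome installation plus the bundled chromedriver are assumed:

    from scrape import scrape_website, extract_body_content, clean_body_content, split_dom_content

    html = scrape_website("https://example.com")  # raw page source via Selenium
    body = extract_body_content(html)             # keep only the <body> markup
    text = clean_body_content(body)               # drop scripts/styles, collapse whitespace
    chunks = split_dom_content(text)              # 6000-character chunks for the model
    print(f"{len(chunks)} chunk(s) ready for parsing")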