khatri-indra commited on
Commit
5f82c3d
·
verified ·
1 Parent(s): ee611a0

Upload 5 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. chromedriver.exe +3 -0
  3. main.py +27 -0
  4. parse.py +29 -0
  5. requirements.txt +8 -0
  6. scapping.py +46 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chromedriver.exe filter=lfs diff=lfs merge=lfs -text
chromedriver.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e67707e3ca5b68b16d1bf436c3c7cdd845977846bda16fb89308a6994e6006
3
+ size 17792000
main.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end: scrape a URL, preview the cleaned DOM text, and parse it with Gemini."""
import streamlit as st
from scapping import scrape_website, split_dom_content, clean_body_content, extract_body_content
from parse import parse_with_gemini

st.title("Web Scrapper")
url = st.text_input("Enter the Web URL:", placeholder="URL")

if st.button("Start Scraping"):
    if not url.strip():
        # Guard: without this, an empty URL would be handed straight to the browser.
        st.warning("Please enter a URL first.")
    else:
        st.write("Scrapping...")
        result = scrape_website(url)

        # Reduce the raw HTML to readable text before storing it for parsing.
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)

        # session_state persists across Streamlit reruns, so the parse step
        # below can find the scraped text after the next button click.
        st.session_state.dom_content = cleaned_content
        with st.expander("View DOM Content"):
            st.text_area("DOM Content", cleaned_content, height=300)

if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse?")

    if st.button("Parse Content"):
        if parse_description:
            st.write("Parsing the content...")
            # Split into bounded chunks so each fits in one model prompt.
            dom_chunks = split_dom_content(st.session_state.dom_content)
            result = parse_with_gemini(dom_chunks, parse_description)
            st.write(result)
parse.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gemini-backed parsing helpers for the scraper UI."""
import os
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Pull settings from a local .env file (presumably GOOGLE_API_KEY for the
# Gemini client — confirm against deployment docs).
load_dotenv()

# Single module-level LLM client, shared by every parse call.
llm = GoogleGenerativeAI(model="gemini-pro")

# Extraction prompt. {dom_content} is one text chunk of the scraped page and
# {parse_description} is the user's plain-language extraction request.
# Fix: instruction 3 previously ran straight into instruction 4 with no
# separator ("...empty string ('')."4. ..."); items now end with ". " like
# instructions 1 and 2 do.
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string (''). "
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)
18
+
19
def parse_with_gemini(dom_chunks, parse_description):
    """Run every DOM chunk through the Gemini chain and join the answers.

    Args:
        dom_chunks: list of text chunks from the scraped page.
        parse_description: natural-language description of what to extract.

    Returns:
        The per-chunk model responses, joined with newlines.
    """
    chain = ChatPromptTemplate.from_template(template) | llm

    parsed_results = []
    total = len(dom_chunks)
    for batch_no, chunk in enumerate(dom_chunks, start=1):
        answer = chain.invoke(
            {"dom_content": chunk, "parse_description": parse_description}
        )
        # Progress feedback on stdout only.
        print(f"Parsed batch {batch_no} of {total}")
        parsed_results.append(answer)

    return "\n".join(parsed_results)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ langchain_ollama
+ langchain-google-genai
4
+ selenium
5
+ beautifulsoup4
6
+ lxml
7
+ html5lib
8
+ python-dotenv
scapping.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import selenium.webdriver as webdriver
2
+ from selenium.webdriver.chrome.service import Service
3
+ import time
4
+ from bs4 import BeautifulSoup
5
+
6
def scrape_website(website):
    """Load *website* in a local Chrome instance and return the rendered HTML.

    Args:
        website: URL to load.

    Returns:
        The page source as a string.

    The driver is always shut down, even if navigation raises.
    """
    print("launching chrome browser...")

    chrome_driver_path = "chromedriver.exe"
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print('page loaded..,')
        # Bug fix: wait BEFORE capturing page_source so late-loading content is
        # included. The original slept *after* the capture, which only wasted
        # 10 seconds without improving the snapshot.
        time.sleep(10)
        html = driver.page_source
        return html
    finally:
        driver.quit()
22
+
23
def extract_body_content(html_content):
    """Return the <body> element of *html_content* as markup, or "" if absent/empty."""
    parsed = BeautifulSoup(html_content, "html.parser")
    body = parsed.body
    # Same truthiness test as a plain `if body:` — an absent (or empty) body
    # yields the empty string.
    return str(body) if body else ""
29
+
30
def clean_body_content(body_content):
    """Strip <script>/<style> tags from an HTML fragment and return its text.

    The text is normalized: one stripped, non-empty line per output line.
    """
    soup = BeautifulSoup(body_content, "html.parser")

    # Remove non-content tags before extracting text.
    for tag in soup.find_all({"script", "style"}):
        tag.extract()

    raw_text = soup.get_text(separator="\n")
    stripped = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped if line)
42
+
43
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive chunks of at most *max_length* chars.

    Returns an empty list for empty input; the final chunk may be shorter.
    """
    chunks = []
    for start in range(0, len(dom_content), max_length):
        chunks.append(dom_content[start:start + max_length])
    return chunks