# Spaces: Paused
import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from Data import markdown_to_csv
from parse import parse, merge_tables_with_llm
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content

st.title("AI Web Scraper")

# Multi-URL input: one URL per line. Surrounding whitespace is stripped and
# blank lines are dropped before scraping.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    if not urls_list:
        # Without this guard an empty list would be written to session state,
        # which makes the parse section appear with nothing to parse.
        st.warning("Please enter at least one URL to scrape.")
    else:
        all_results = []
        for url in urls_list:
            st.write(f"Scraping: {url}")
            # Pipeline per URL: fetch -> extract body -> clean.
            result = scrape_website(url)
            body_content = extract_body_content(result)
            cleaned_content = clean_body_content(body_content)
            all_results.append(cleaned_content)

        # One cleaned entry per URL, in input order. The parse section checks
        # for this session-state key before rendering.
        st.session_state.all_dom_content = all_results

# The parse UI is only rendered once scraped content exists in session state.
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if not parse_description:
            # Previously an empty description made the button a silent no-op.
            st.warning("Please describe what you want to parse.")
        else:
            all_tables = []
            scraped_sites = st.session_state.all_dom_content
            for site_number, dom_content in enumerate(scraped_sites, start=1):
                st.write(f"Parsing content from site {site_number}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)
                st.write("Raw LLM Output:")
                st.write(result)

                # markdown_to_csv yields a (possibly empty) collection of
                # tables extracted from the LLM output.
                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                    all_tables.extend(tables)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    # A per-site key keeps each widget unique: without it, two
                    # sites producing identical raw output would create
                    # identical widgets, which Streamlit rejects.
                    st.text_area(
                        "Raw Output",
                        result,
                        height=200,
                        key=f"raw_output_{site_number}",
                    )

            # Merge tables using LLM
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert merged table string to DataFrame
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    # Display the first (and hopefully only) merged table
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")