Spaces:

PyQuarX
/

scrape-with-ai

Paused

App Files Files Community

PyQuarX committed on May 8, 2025

Commit

b051531

verified ·

1 Parent(s): e540784

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -25

app.py CHANGED Viewed

@@ -1,37 +1,96 @@
 import streamlit as st
 from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
-from parse import parse
 from Data import markdown_to_csv
 st.title("AI Web Scraper")
-# Previous single-URL flow: one text input, scraped when the button is clicked.
-url = st.text_input("Enter a Website URL")
-if st.button("Scrape Site"):
-    st.write("Scraping the website")
-    # Helpers imported from `scraper`; presumably fetch -> extract body -> clean
-    # (their behavior is defined in scraper.py, not visible here).
-    result = scrape_website(url)
-    body_content = extract_body_content(result)
-    cleaned_content = clean_body_content(body_content)
-    # Kept in session state; the parse section below is gated on this key.
-    st.session_state.dom_content = cleaned_content
-    with st.expander("View DOM Content"):
-        st.text_area("DOM Content", cleaned_content, height=300)
-# Parsing UI: only rendered once `dom_content` exists in session state.
-if "dom_content" in st.session_state:
-    parse_description = st.text_area("Describe what you want to parse?")
     if st.button("Parse Content"):
         if parse_description:
-            st.write("Parsing Content")
-            dom_chunks = split_dom_content(st.session_state.dom_content)
-            result = parse(dom_chunks,parse_description)
-            # Debug aid: dump the raw parser output to stdout.
-            print(repr(result))
-            # Apply the function
-            tables = markdown_to_csv(result)
-            for i in tables:
-                st.write(i)

 import streamlit as st
 from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
+from parse import parse, merge_tables_with_llm
+import streamlit as st
 from Data import markdown_to_csv
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI
+# Load OpenRouter API Key
+openrouter_api_key = "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"
+model = ChatOpenAI(
+    openai_api_key=openrouter_api_key,
+    model="meta-llama/llama-4-maverick:free",
+    base_url="https://openrouter.ai/api/v1"
+)
 st.title("AI Web Scraper")
+# Multi-URL Input
+# One URL per line; surrounding whitespace is stripped and blank lines are dropped.
+urls = st.text_area("Enter Website URLs (one per line)", height=150)
+urls_list = [url.strip() for url in urls.splitlines() if url.strip()]
+# On click, process every URL in order and collect one cleaned result per site.
+if st.button("Scrape Sites"):
+    all_results = []
+    for url in urls_list:
+        st.write(f"Scraping: {url}")
+        # Helpers imported from `scraper`; presumably fetch -> extract body -> clean
+        # (their behavior is defined in scraper.py, not visible here).
+        result = scrape_website(url)
+        body_content = extract_body_content(result)
+        cleaned_content = clean_body_content(body_content)
+        all_results.append(cleaned_content)
+    # Kept in session state; the parse section below is gated on this key.
+    st.session_state.all_dom_content = all_results
+# Parsing UI: only rendered once `all_dom_content` exists in session state.
+if "all_dom_content" in st.session_state:
+    parse_description = st.text_area("Describe what you want to parse from ALL sites:")
     if st.button("Parse Content"):
         if parse_description:
+            # Tables collected across all sites, merged after the loop.
+            all_tables = []
+            for i, dom_content in enumerate(st.session_state.all_dom_content):
+                st.write(f"Parsing content from site {i+1}")
+                # Split the site's content into chunks and run the parser on them
+                # with the user's description.
+                dom_chunks = split_dom_content(dom_content)
+                result = parse(dom_chunks, parse_description)
+                st.write("Raw LLM Output:")
+                st.write(result)
+                # Convert the parser output into tables (a later comment in this
+                # script refers to the merged result as a DataFrame); a falsy
+                # result is treated as "no tables found".
+                tables = markdown_to_csv(result)
+                if tables:
+                    st.write("Extracted Tables:")
+                    for table in tables:
+                        st.write(table)
+                        all_tables.append(table)
+                else:
+                    st.write("No tables found in the output.  Displaying raw output instead.")
+                    st.text_area("Raw Output", result, height=200)  # Display raw output
+            # Merge tables using LLM
+            # NOTE(review): when this line runs, `merge_tables_with_llm` is the
+            # function imported from `parse`; the same-named function defined at
+            # the bottom of this script has not been executed yet — confirm which
+            # implementation is intended.
+            if all_tables:
+                st.write("Merging all tables using LLM...")
+                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
+                st.write("Merged Table (LLM Output):")
+                st.write(merged_table_string)
+                # Convert merged table string to DataFrame
+                merged_tables = markdown_to_csv(merged_table_string)
+                if merged_tables:
+                    st.write("Merged Table (DataFrame):")
+                    st.write(merged_tables[0])  # Display the first (and hopefully only) merged table
+                else:
+                    st.write("Could not convert merged table string to DataFrame.")
+            else:
+                st.write("No tables to merge.")
+def merge_tables_with_llm(tables, parse_description):
+    """Merges a list of Pandas DataFrames into a single Markdown table using LLM."""
+    from langchain_core.prompts import ChatPromptTemplate
+    from langchain_openai import ChatOpenAI
+    # Convert DataFrames to Markdown strings
+    table_strings = [table.to_markdown(index=False) for table in tables]
+    # Create a prompt for the LLM
+    merge_prompt = (
+        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
+        "The tables contain information related to: {parse_description}.\n"
+        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
+        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
+        "Ensure the final output is a single valid Markdown table.\n\n"
+        "Here are the tables:\n\n" + "\n\n".join(table_strings) +
+        "\n\nReturn the merged table in Markdown format:"
+    )
+    # Invoke the LLM
+    response = model.invoke({"dom_content": "", "parse_description": merge_prompt})
+    return response.content