Update app.py

app.py CHANGED
@@ -1,40 +1,117 @@
 import gradio as gr
-from
-from
-from processor import refine_content
-from utils import export_data
 
     try:
-        scraped = scrape_website(refined_task["url"])
-        clean_content = refine_content(scraped["content"], refined_task["intent"])
-        export_paths = export_data(user_prompt, scraped, clean_content)
-        return f"**Refined Instruction:** {refined_task}\n\n**Clean Content Preview:**\n{clean_content[:1000]}...", export_paths
     except Exception as e:
-        return f"
+# app.py
+import os
+import json
+import pandas as pd
+from fpdf import FPDF
 import gradio as gr
+from scraper import scrape
+from qa_openrouter import ask_openrouter
 
+DATA_JSON = "scraped_data.json"
+DATA_CSV = "scraped_data.csv"
+DATA_PDF = "scraped_data.pdf"
+
+def save_data(data: dict):
+    # Flatten a minimal tabular form for CSV: title, url, first_heading, excerpt
+    rows = [{
+        "url": data.get("url",""),
+        "title": data.get("title",""),
+        "first_heading": (data.get("headings") or [""])[0] if data.get("headings") else "",
+        "excerpt": (data.get("text") or "")[:500]
+    }]
+    df = pd.DataFrame(rows)
+    df.to_csv(DATA_CSV, index=False)
+    with open(DATA_JSON, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+    # Save PDF (very simple)
+    pdf = FPDF()
+    pdf.set_auto_page_break(auto=True, margin=12)
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+    text = f"Title: {data.get('title','')}\nURL: {data.get('url','')}\n\n"
+    # include headings and excerpt
+    if data.get("headings"):
+        text += "Headings:\n" + "\n".join(data["headings"][:10]) + "\n\n"
+    text += "Excerpt:\n" + (data.get("text") or "")[:4000]
+    for line in text.split("\n"):
+        pdf.multi_cell(0, 6, line)
+    pdf.output(DATA_PDF)
+
+def do_scrape(url: str, instruction: str, force_render: bool):
     try:
+        data = scrape(url, force_render=force_render)
     except Exception as e:
+        return f"Scrape failed: {e}", None
+    # optionally apply instruction: we will filter/collect based on simple instruction heuristics
+    # For minimal version, just save full data and return short summary
+    save_data(data)
+    summary = {
+        "title": data.get("title",""),
+        "url": data.get("url",""),
+        "headings_count": len(data.get("headings") or []),
+        "links_count": len(data.get("links") or []),
+        "text_chars": len(data.get("text") or "")
+    }
+    return "Scrape successful.", summary
 
+def load_data_for_qa() -> str:
+    if not os.path.exists(DATA_JSON):
+        return ""
+    with open(DATA_JSON, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    # Combine title, headings and text
+    parts = []
+    if data.get("title"):
+        parts.append(f"Title: {data['title']}")
+    if data.get("headings"):
+        parts.append("Headings:\n" + "\n".join(data["headings"]))
+    if data.get("text"):
+        parts.append("Content:\n" + data["text"])
+    return "\n\n".join(parts)
 
+def do_qa(question: str, model: str = "openrouter/auto"):
+    ctx = load_data_for_qa()
+    if not ctx:
+        return "No scraped data available. Run a scrape first."
+    try:
+        answer = ask_openrouter(question, ctx, model=model)
+        return answer
+    except Exception as e:
+        return f"OpenRouter request failed: {e}"
 
+# Build Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("# Minimal AI Scraper + Q/A (OpenRouter)\nEnter a URL and an instruction. Scrapes via Requests/BeautifulSoup and Playwright (fallback). Stores data and enables Q/A via OpenRouter.")
+    with gr.Row():
+        url_in = gr.Textbox(label="URL", placeholder="https://example.com", lines=1)
+        instr = gr.Textbox(label="Instruction (optional)", placeholder="E.g., 'Get article titles'", lines=1)
+    with gr.Row():
+        force_chk = gr.Checkbox(label="Force render with Playwright (JS render)", value=False)
+        scrape_btn = gr.Button("Scrape")
+    result_txt = gr.Textbox(label="Status", interactive=False)
+    summary_out = gr.JSON(label="Summary (saved data metadata)")
 
+    with gr.Row():
+        download_csv = gr.File(label="Download CSV", visible=True)
+        download_json = gr.File(label="Download JSON", visible=True)
+        download_pdf = gr.File(label="Download PDF", visible=True)
 
+    gr.Markdown("## Q/A about scraped data")
+    qa_question = gr.Textbox(label="Ask a question about the scraped data", lines=2)
+    model_choice = gr.Textbox(label="OpenRouter model (optional)", value="openrouter/auto", lines=1)
+    qa_btn = gr.Button("Ask")
+    qa_answer = gr.Textbox(label="Answer", interactive=False)
 
+    def on_scrape(url, instruction, force):
+        status, summary = do_scrape(url, instruction, force)
+        # update file components (they will point to local files)
+        csvf = DATA_CSV if os.path.exists(DATA_CSV) else None
+        jsonf = DATA_JSON if os.path.exists(DATA_JSON) else None
+        pdff = DATA_PDF if os.path.exists(DATA_PDF) else None
+        return status, summary, csvf, jsonf, pdff
 
+    scrape_btn.click(on_scrape, inputs=[url_in, instr, force_chk], outputs=[result_txt, summary_out, download_csv, download_json, download_pdf])
+    qa_btn.click(fn=do_qa, inputs=[qa_question, model_choice], outputs=[qa_answer])
 
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)
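
The new app.py imports scrape from a local scraper module that is not part of this diff. For reference, here is a minimal sketch of what that module could look like, inferred only from the UI text in app.py ("Requests/BeautifulSoup and Playwright (fallback)") and from the dict keys the app reads (url, title, headings, links, text). Every name and detail below is an assumption, not the Space's actual code; in particular, only an explicit force_render path is shown, whereas the commit's description also implies an automatic fallback.

# scraper.py -- hypothetical sketch; this module is not included in the commit.
# Returns a dict with the keys app.py consumes: url, title, headings, links, text.
import requests
from bs4 import BeautifulSoup

def _parse(html: str, url: str) -> dict:
    soup = BeautifulSoup(html, "html.parser")
    return {
        "url": url,
        "title": soup.title.get_text(strip=True) if soup.title else "",
        "headings": [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])],
        "links": [a["href"] for a in soup.find_all("a", href=True)],
        "text": soup.get_text(separator="\n", strip=True),
    }

def _render_with_playwright(url: str) -> str:
    # JS-rendering path; assumes Chromium was installed via `playwright install chromium`.
    from playwright.sync_api import sync_playwright
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url, wait_until="networkidle")
        html = page.content()
        browser.close()
    return html

def scrape(url: str, force_render: bool = False) -> dict:
    if force_render:
        return _parse(_render_with_playwright(url), url)
    resp = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    return _parse(resp.text, url)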
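
Likewise, ask_openrouter comes from a qa_openrouter module that this commit does not show. A minimal sketch follows, assuming it calls OpenRouter's OpenAI-compatible chat-completions endpoint with an API key read from an OPENROUTER_API_KEY environment variable; the signature matches the call in do_qa, but the body, the system prompt, and the 12000-character context cap are illustrative assumptions.

# qa_openrouter.py -- hypothetical sketch; not part of this commit.
# Assumes OpenRouter's OpenAI-compatible endpoint and an OPENROUTER_API_KEY env var.
import os
import requests

API_URL = "https://openrouter.ai/api/v1/chat/completions"

def ask_openrouter(question: str, context: str, model: str = "openrouter/auto") -> str:
    api_key = os.environ["OPENROUTER_API_KEY"]
    payload = {
        "model": model,
        "messages": [
            # Context cap is an arbitrary illustrative limit, not from the source.
            {"role": "system",
             "content": "Answer using only the scraped page content below.\n\n" + context[:12000]},
            {"role": "user", "content": question},
        ],
    }
    resp = requests.post(
        API_URL,
        headers={"Authorization": f"Bearer {api_key}"},
        json=payload,
        timeout=60,
    )
    resp.raise_for_status()
    # OpenAI-style response shape: first choice's message content.
    return resp.json()["choices"][0]["message"]["content"]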