mibrahimzia commited on
Commit
3e39a1c
·
verified ·
1 Parent(s): b84fcf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -28
app.py CHANGED
@@ -1,40 +1,117 @@
 
 
 
 
 
1
  import gradio as gr
2
- from llm_utils import generate_scraper_instruction, answer_query
3
- from scraper import scrape_website
4
- from processor import refine_content
5
- from utils import export_data
6
 
7
- def run_pipeline(user_prompt):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  try:
9
- refined_task = generate_scraper_instruction(user_prompt)
10
- scraped = scrape_website(refined_task["url"])
11
- clean_content = refine_content(scraped["content"], refined_task["intent"])
12
- export_paths = export_data(user_prompt, scraped, clean_content)
13
- return f"**Refined Instruction:** {refined_task}\n\n**Clean Content Preview:**\n{clean_content[:1000]}...", export_paths
14
  except Exception as e:
15
- return f" Error: {str(e)}", {}
16
-
17
- def qna_response(query, context):
18
- return answer_query(query, context)
 
 
 
 
 
 
 
 
19
 
20
- with gr.Blocks(title="AI-Assisted Data Pipeline") as demo:
21
- gr.Markdown("## 🌐 AI-Assisted Web Data Pipeline\nType your query and let AI handle the web data extraction and analysis.")
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- user_prompt = gr.Textbox(label="Your request", placeholder="e.g., Get recent AI startup funding news")
24
- run_btn = gr.Button("Run Pipeline")
 
 
 
 
 
 
 
25
 
26
- result_output = gr.Markdown()
27
- files_output = gr.File(label="Download Processed Data")
 
 
 
 
 
 
 
 
 
28
 
29
- context_state = gr.State()
 
 
 
30
 
31
- run_btn.click(fn=run_pipeline, inputs=[user_prompt], outputs=[result_output, files_output], show_progress="full")
 
 
 
 
32
 
33
- gr.Markdown("### 🧩 Q&A on Extracted Data")
34
- question = gr.Textbox(label="Ask a question about the scraped content")
35
- answer_box = gr.Textbox(label="AI Answer", interactive=False)
36
- ask_btn = gr.Button("Ask")
 
 
 
37
 
38
- ask_btn.click(fn=qna_response, inputs=[question, context_state], outputs=answer_box)
 
39
 
40
- demo.launch()
 
 
1
+ # app.py
2
+ import os
3
+ import json
4
+ import pandas as pd
5
+ from fpdf import FPDF
6
  import gradio as gr
7
+ from scraper import scrape
8
+ from qa_openrouter import ask_openrouter
 
 
9
 
10
+ DATA_JSON = "scraped_data.json"
11
+ DATA_CSV = "scraped_data.csv"
12
+ DATA_PDF = "scraped_data.pdf"
13
+
14
def save_data(data: dict) -> None:
    """Persist scraped *data* to CSV, JSON and PDF files in the working dir.

    The CSV gets a one-row flattened summary (url, title, first heading,
    500-char excerpt); the JSON stores the full payload verbatim; the PDF is
    a simple plain-text rendering of title, headings and a 4000-char excerpt.

    Raises whatever the underlying file writers raise (e.g. OSError).
    """
    headings = data.get("headings") or []
    # Flatten a minimal tabular form for CSV: title, url, first_heading, excerpt
    rows = [{
        "url": data.get("url", ""),
        "title": data.get("title", ""),
        "first_heading": headings[0] if headings else "",
        "excerpt": (data.get("text") or "")[:500],
    }]
    pd.DataFrame(rows).to_csv(DATA_CSV, index=False)
    with open(DATA_JSON, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Save PDF (very simple text dump).
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=12)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    text = f"Title: {data.get('title','')}\nURL: {data.get('url','')}\n\n"
    # Include up to the first 10 headings and a bounded excerpt of the text.
    if headings:
        text += "Headings:\n" + "\n".join(headings[:10]) + "\n\n"
    text += "Excerpt:\n" + (data.get("text") or "")[:4000]
    for line in text.split("\n"):
        # FPDF's built-in core fonts are Latin-1 only; scraped pages routinely
        # contain characters outside that range, which would otherwise raise
        # UnicodeEncodeError when the PDF is written. Replace unencodable
        # characters instead of crashing the whole export.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 6, safe_line)
    pdf.output(DATA_PDF)
39
+
40
def do_scrape(url: str, instruction: str, force_render: bool):
    """Scrape *url*, persist the result, and return a (status, summary) pair.

    On failure the status describes the error and the summary is None. The
    *instruction* argument is accepted but not yet applied; this minimal
    version always saves the full scraped payload.
    """
    try:
        data = scrape(url, force_render=force_render)
    except Exception as e:
        return f"Scrape failed: {e}", None
    # Persist everything, then report lightweight metadata about what we got.
    save_data(data)
    headings = data.get("headings") or []
    links = data.get("links") or []
    text = data.get("text") or ""
    summary = {
        "title": data.get("title", ""),
        "url": data.get("url", ""),
        "headings_count": len(headings),
        "links_count": len(links),
        "text_chars": len(text),
    }
    return "Scrape successful.", summary
56
 
57
def load_data_for_qa() -> str:
    """Assemble a single Q/A context string from the saved JSON scrape.

    Combines title, headings and page text, separated by blank lines.
    Returns an empty string when no scrape has been saved yet.
    """
    if not os.path.exists(DATA_JSON):
        return ""
    with open(DATA_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)
    chunks = []
    title = data.get("title")
    if title:
        chunks.append(f"Title: {title}")
    heads = data.get("headings")
    if heads:
        chunks.append("Headings:\n" + "\n".join(heads))
    body = data.get("text")
    if body:
        chunks.append("Content:\n" + body)
    return "\n\n".join(chunks)
71
 
72
def do_qa(question: str, model: str = "openrouter/auto"):
    """Answer *question* against the most recently saved scrape via OpenRouter.

    Returns a human-readable message (not an exception) when no data has been
    scraped yet or when the OpenRouter request fails.
    """
    context = load_data_for_qa()
    if not context:
        return "No scraped data available. Run a scrape first."
    try:
        return ask_openrouter(question, context, model=model)
    except Exception as e:
        return f"OpenRouter request failed: {e}"
81
 
82
# Build the Gradio UI: scrape controls on top, download links, then a Q/A panel.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Minimal AI Scraper + Q/A (OpenRouter)\n"
        "Enter a URL and an instruction. Scrapes via Requests/BeautifulSoup and "
        "Playwright (fallback). Stores data and enables Q/A via OpenRouter."
    )
    with gr.Row():
        url_box = gr.Textbox(label="URL", placeholder="https://example.com", lines=1)
        instruction_box = gr.Textbox(label="Instruction (optional)", placeholder="E.g., 'Get article titles'", lines=1)
    with gr.Row():
        render_checkbox = gr.Checkbox(label="Force render with Playwright (JS render)", value=False)
        scrape_button = gr.Button("Scrape")
    status_box = gr.Textbox(label="Status", interactive=False)
    summary_json = gr.JSON(label="Summary (saved data metadata)")

    with gr.Row():
        csv_file = gr.File(label="Download CSV", visible=True)
        json_file = gr.File(label="Download JSON", visible=True)
        pdf_file = gr.File(label="Download PDF", visible=True)

    gr.Markdown("## Q/A about scraped data")
    question_box = gr.Textbox(label="Ask a question about the scraped data", lines=2)
    model_box = gr.Textbox(label="OpenRouter model (optional)", value="openrouter/auto", lines=1)
    ask_button = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer", interactive=False)

    def on_scrape(url, instruction, force):
        # Run the pipeline, then surface whichever export files exist on disk
        # (None hides a file slot when its export was not produced).
        status, summary = do_scrape(url, instruction, force)
        exports = [path if os.path.exists(path) else None
                   for path in (DATA_CSV, DATA_JSON, DATA_PDF)]
        return (status, summary, *exports)

    scrape_button.click(
        on_scrape,
        inputs=[url_box, instruction_box, render_checkbox],
        outputs=[status_box, summary_json, csv_file, json_file, pdf_file],
    )
    ask_button.click(fn=do_qa, inputs=[question_box, model_box], outputs=[answer_box])

if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the conventional HF Spaces port).
    demo.launch(server_name="0.0.0.0", server_port=7860)