Payam75 committed on
Commit
e600655
·
verified ·
1 Parent(s): 760325d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -95
app.py CHANGED
@@ -1,104 +1,34 @@
1
- import os
2
- from typing import List, Tuple, Optional
3
  import gradio as gr
4
- import markdown2
5
- from tqdm import tqdm
6
- from utils import normalize_url, scrape_with_bs
7
 
8
# newspaper3k is an optional dependency: when it imports cleanly the UI can
# offer main-article extraction; otherwise the feature is flagged unavailable
# and the corresponding checkbox is disabled.
USE_NEWSPAPER_DEFAULT = False
try:
    from newspaper import Article
    NEWSPAPER_AVAILABLE = True
except Exception:
    # Broad catch is deliberate: any import-time failure (missing package,
    # broken native deps) just downgrades the feature instead of crashing.
    NEWSPAPER_AVAILABLE = False
15
 
16
# Transformers summarization: dropdown choices (the first entry disables
# summarization entirely) and a process-wide cache so each HF pipeline is
# loaded at most once per model name.
SUMM_MODELS = ["None (no summarization)", "sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn"]
_pipeline_cache = {}
19
 
20
def get_summarizer(model_name: str):
    """Return a (cached) HF summarization pipeline for *model_name*.

    Returns None when summarization is disabled (sentinel choice or None).
    The transformers import is deferred so the app starts without it when
    summarization is never used.
    """
    if model_name is None or model_name == "None (no summarization)":
        return None
    cached = _pipeline_cache.get(model_name)
    if cached is not None:
        return cached
    from transformers import pipeline
    pipe = pipeline("summarization", model=model_name)
    _pipeline_cache[model_name] = pipe
    return pipe
29
 
30
def summarize_text(text: str, summarizer, max_chars=6000) -> str:
    """Summarize *text* with *summarizer*, bounding the input to *max_chars*.

    Returns "" when no summarizer is supplied; inputs shorter than 25 words
    are returned as-is (too short to be worth summarizing).
    """
    if not summarizer:
        return ""
    # Strip then cap length — slicing a shorter string is a no-op, so this
    # matches the explicit length check it replaces.
    snippet = text.strip()[:max_chars]
    if len(snippet.split()) < 25:
        return snippet
    result = summarizer(snippet, max_length=180, min_length=60, do_sample=False, truncation=True)
    return result[0]["summary_text"].strip()
40
 
41
def scrape_single_url(url: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str, str]:
    """Scrape one URL and return (markdown_text, html_preview).

    Uses newspaper3k main-article extraction when requested and available,
    falling back to the BeautifulSoup scraper otherwise. An optional summary
    block is prepended when a summarization model is selected.

    Raises: whatever the underlying download/parse helpers raise — the caller
    (process_urls) converts failures into per-URL error entries.
    """
    url = normalize_url(url)
    if use_newspaper and NEWSPAPER_AVAILABLE:
        article = Article(url)
        article.download()
        article.parse()
        title = article.title or None
        paragraphs = [p.strip() for p in article.text.split("\n") if p.strip()]
        md_body = "\n\n".join(paragraphs)
    else:
        md_body, title = scrape_with_bs(url, remove_images=remove_images)
    md_text = f"# {title or url}\n\n{md_body}\n"

    summarizer = get_summarizer(summarizer_model)
    summary = summarize_text(md_text, summarizer) if summarizer else ""
    header = f"**Summary:**\n\n> {summary}\n\n---\n\n" if summary else ""
    final_md = f"{header}{md_text}\n---\n"

    # BUG FIX: the original called md.markdown(...) but no `md` name exists —
    # the module imports `markdown2`. Also, `extensions=` is the `markdown`
    # package's kwarg; markdown2 takes `extras` with dash-named options.
    html_preview = markdown2.markdown(final_md, extras=["fenced-code-blocks", "tables"])
    return final_md, html_preview
65
 
66
def process_urls(urls_text: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str, str, str]:
    """Scrape every non-blank line of *urls_text*.

    Returns (combined_markdown, combined_html, output_file_path); when no
    URLs are given, returns a human-readable message and empty strings.
    Side effect: writes the combined Markdown to ./output.md.
    """
    urls: List[str] = [u.strip() for u in urls_text.splitlines() if u.strip()]
    if not urls:
        return "Please provide at least one URL.", "", ""
    combined_md_parts = []
    combined_html_parts = []
    for url in tqdm(urls):
        try:
            md_text, html_preview = scrape_single_url(url, remove_images, use_newspaper, summarizer_model)
            combined_md_parts.append(md_text)
            combined_html_parts.append(html_preview)
        except Exception as exc:
            from html import escape  # local import keeps the module header unchanged
            combined_md_parts.append(f"# {url}\n\n**Error:** {exc}\n---\n")
            # BUG FIX: failed URLs used to be silently dropped from the HTML
            # preview, desynchronizing it from the Markdown pane. Error text
            # is escaped since it may echo untrusted URL/content fragments.
            combined_html_parts.append(
                f"<h1>{escape(url)}</h1><p><strong>Error:</strong> {escape(str(exc))}</p><hr/>"
            )
    combined_md = "\n".join(combined_md_parts).strip()
    combined_html = "\n".join(combined_html_parts).strip()
    out_path = os.path.abspath("output.md")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined_md)
    return combined_md, combined_html, out_path
85
 
86
# Assemble the Gradio UI: URL list input, option row, action button,
# side-by-side Markdown/HTML outputs, and a download slot.
with gr.Blocks(title="Web → Markdown Scraper") as demo:
    gr.Markdown("# 🌐 Web → Markdown Scraper with Multi-URL and Summarization")

    with gr.Row():
        url_input = gr.Textbox(
            label="Enter URLs (one per line)",
            lines=6,
            placeholder="https://example.com\nhttps://news.ycombinator.com",
        )

    # Same label text as before, built imperatively instead of via an f-string.
    newspaper_label = "Extract Main Article (newspaper3k)"
    if not NEWSPAPER_AVAILABLE:
        newspaper_label += " [unavailable]"

    with gr.Row():
        strip_images_cb = gr.Checkbox(label="Remove Images", value=False)
        newspaper_cb = gr.Checkbox(
            label=newspaper_label,
            value=USE_NEWSPAPER_DEFAULT and NEWSPAPER_AVAILABLE,
            interactive=NEWSPAPER_AVAILABLE,
        )
        model_dd = gr.Dropdown(
            label="Summarization Model",
            choices=SUMM_MODELS,
            value=SUMM_MODELS[1],
            allow_custom_value=True,
        )

    convert_btn = gr.Button("Scrape & Convert", variant="primary")

    with gr.Row(equal_height=True):
        markdown_box = gr.Textbox(label="Markdown Output", lines=22)
        preview_pane = gr.HTML(label="Preview (rendered)")
    md_file = gr.File(label="Download .md", interactive=False)

    convert_btn.click(
        process_urls,
        inputs=[url_input, strip_images_cb, newspaper_cb, model_dd],
        outputs=[markdown_box, preview_pane, md_file],
    )

if __name__ == "__main__":
    demo.launch()
 
 
 
1
  import gradio as gr
2
+ from scraper import scrape_to_markdown
 
 
3
 
4
def process_url(url: str):
    """Convert *url* to Markdown via scrape_to_markdown.

    Returns (markdown_text, html_preview, file_text); on any failure the
    first element is an error message, the preview is a fixed error snippet,
    and the file text is empty.
    """
    try:
        markdown_text, preview_html = scrape_to_markdown(url)
    except Exception as exc:
        return f"❌ Error: {str(exc)}", "<p style='color:red'>Error occurred</p>", ""
    return markdown_text, preview_html, markdown_text
 
10
 
11
# Assemble the Gradio UI: single-URL input, fetch button, Markdown/HTML
# output panes, and a Markdown download slot.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌐 Web2Markdown Tool\nConvert any webpage into clean Markdown")

    with gr.Row():
        url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", scale=3)
        fetch_btn = gr.Button("🚀 Fetch", scale=1)

    with gr.Row():
        md_output = gr.Code(label="Markdown Output", language="markdown")
        html_output = gr.HTML(label="Preview (HTML)")

    download_file = gr.File(label="Download Markdown (.md)", file_types=[".md"], interactive=False)

    def handle_process(url):
        """Run the scrape and persist the Markdown for the download widget.

        Returns (markdown_text, html_preview, file_path_or_None).
        BUG FIX: previously an empty output.md was written and offered for
        download even when scraping failed; now the download slot is left
        empty (None) on error.
        """
        md_text, html_preview, md_file = process_url(url)
        if not md_file:
            return md_text, html_preview, None
        filename = "output.md"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(md_file)
        return md_text, html_preview, filename

    fetch_btn.click(handle_process, inputs=url_input, outputs=[md_output, html_output, download_file])

if __name__ == "__main__":
    demo.launch()