Payam75 committed on
Commit
f5c57cb
·
verified ·
1 Parent(s): 70682a3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Tuple, Optional
3
+ import gradio as gr
4
+ import markdown as md
5
+ from tqdm import tqdm
6
+ from utils import normalize_url, scrape_with_bs
7
+
8
+ # newspaper3k
9
+ USE_NEWSPAPER_DEFAULT = False
10
+ try:
11
+ from newspaper import Article
12
+ NEWSPAPER_AVAILABLE = True
13
+ except Exception:
14
+ NEWSPAPER_AVAILABLE = False
15
+
16
+ # Transformers summarization
17
+ SUMM_MODELS = ["None (no summarization)", "sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn"]
18
+ _pipeline_cache = {}
19
+
20
+ def get_summarizer(model_name: str):
21
+ if model_name in ("None (no summarization)", None):
22
+ return None
23
+ if model_name in _pipeline_cache:
24
+ return _pipeline_cache[model_name]
25
+ from transformers import pipeline
26
+ summarizer = pipeline("summarization", model=model_name)
27
+ _pipeline_cache[model_name] = summarizer
28
+ return summarizer
29
+
30
+ def summarize_text(text: str, summarizer, max_chars=6000) -> str:
31
+ if not summarizer:
32
+ return ""
33
+ text = text.strip()
34
+ if len(text) > max_chars:
35
+ text = text[:max_chars]
36
+ if len(text.split()) < 25:
37
+ return text
38
+ out = summarizer(text, max_length=180, min_length=60, do_sample=False, truncation=True)
39
+ return out[0]["summary_text"].strip()
40
+
41
+ def scrape_single_url(url: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str,str]:
42
+ url = normalize_url(url)
43
+ title = None
44
+ md_text = ""
45
+ if use_newspaper and NEWSPAPER_AVAILABLE:
46
+ article = Article(url)
47
+ article.download()
48
+ article.parse()
49
+ title = article.title or None
50
+ paragraphs = [p.strip() for p in article.text.split("\n") if p.strip()]
51
+ md_body = "\n\n".join(paragraphs)
52
+ md_text = f"# {title or url}\n\n{md_body}\n"
53
+ else:
54
+ md_body, title = scrape_with_bs(url, remove_images=remove_images)
55
+ md_text = f"# {title or url}\n\n{md_body}\n"
56
+ summarizer = get_summarizer(summarizer_model)
57
+ summary = summarize_text(md_text, summarizer) if summarizer else ""
58
+ if summary:
59
+ header = f"**Summary:**\n\n> {summary}\n\n---\n\n"
60
+ else:
61
+ header = ""
62
+ final_md = f"{header}{md_text}\n---\n"
63
+ html_preview = md.markdown(final_md, extensions=["fenced_code","tables"])
64
+ return final_md, html_preview
65
+
66
+ def process_urls(urls_text: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str,str,str]:
67
+ urls: List[str] = [u.strip() for u in urls_text.splitlines() if u.strip()]
68
+ if not urls:
69
+ return "Please provide at least one URL.", "", ""
70
+ combined_md_parts = []
71
+ combined_html_parts = []
72
+ for url in tqdm(urls):
73
+ try:
74
+ md_text, html_preview = scrape_single_url(url, remove_images, use_newspaper, summarizer_model)
75
+ combined_md_parts.append(md_text)
76
+ combined_html_parts.append(html_preview)
77
+ except Exception as e:
78
+ combined_md_parts.append(f"# {url}\n\n**Error:** {e}\n---\n")
79
+ combined_md = "\n".join(combined_md_parts).strip()
80
+ combined_html = "\n".join(combined_html_parts).strip()
81
+ out_path = os.path.abspath("output.md")
82
+ with open(out_path,"w",encoding="utf-8") as f:
83
+ f.write(combined_md)
84
+ return combined_md, combined_html, out_path
85
+
86
+ with gr.Blocks(title="Web → Markdown Scraper") as demo:
87
+ gr.Markdown("# 🌐 Web → Markdown Scraper with Multi-URL and Summarization")
88
+ with gr.Row():
89
+ urls_box = gr.Textbox(label="Enter URLs (one per line)",lines=6,placeholder="https://example.com\nhttps://news.ycombinator.com")
90
+ with gr.Row():
91
+ remove_images = gr.Checkbox(label="Remove Images", value=False)
92
+ use_newspaper = gr.Checkbox(label=f"Extract Main Article (newspaper3k){'' if NEWSPAPER_AVAILABLE else ' [unavailable]'}",
93
+ value=USE_NEWSPAPER_DEFAULT and NEWSPAPER_AVAILABLE,
94
+ interactive=NEWSPAPER_AVAILABLE)
95
+ model_choice = gr.Dropdown(label="Summarization Model", choices=SUMM_MODELS, value=SUMM_MODELS[1], allow_custom_value=True)
96
+ run_btn = gr.Button("Scrape & Convert", variant="primary")
97
+ with gr.Row(equal_height=True):
98
+ md_output = gr.Textbox(label="Markdown Output", lines=22)
99
+ html_preview = gr.HTML(label="Preview (rendered)")
100
+ download_file = gr.File(label="Download .md", interactive=False)
101
+ run_btn.click(process_urls, inputs=[urls_box, remove_images, use_newspaper, model_choice],
102
+ outputs=[md_output, html_preview, download_file])
103
+ if __name__=="__main__":
104
+ demo.launch()