Create app.py
app.py (ADDED)
@@ -0,0 +1,104 @@
import os
from typing import List, Tuple, Optional

import gradio as gr
import markdown as md
from tqdm import tqdm

from utils import normalize_url, scrape_with_bs

# Optional newspaper3k support for main-article extraction.
USE_NEWSPAPER_DEFAULT = False
try:
    from newspaper import Article
    NEWSPAPER_AVAILABLE = True
except Exception:
    NEWSPAPER_AVAILABLE = False

# Transformers summarization models offered in the UI.
SUMM_MODELS = ["None (no summarization)", "sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn"]
_pipeline_cache = {}


def get_summarizer(model_name: str):
    """Return a cached summarization pipeline, or None if summarization is disabled."""
    if model_name in ("None (no summarization)", None):
        return None
    if model_name in _pipeline_cache:
        return _pipeline_cache[model_name]
    from transformers import pipeline
    summarizer = pipeline("summarization", model=model_name)
    _pipeline_cache[model_name] = summarizer
    return summarizer


def summarize_text(text: str, summarizer, max_chars=6000) -> str:
    """Summarize text, truncating overly long inputs and skipping very short ones."""
    if not summarizer:
        return ""
    text = text.strip()
    if len(text) > max_chars:
        text = text[:max_chars]
    if len(text.split()) < 25:
        return text
    out = summarizer(text, max_length=180, min_length=60, do_sample=False, truncation=True)
    return out[0]["summary_text"].strip()


def scrape_single_url(url: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str, str]:
    """Scrape one URL to Markdown and return (markdown, rendered HTML preview)."""
    url = normalize_url(url)
    title = None
    md_text = ""
    if use_newspaper and NEWSPAPER_AVAILABLE:
        # Extract only the main article body with newspaper3k.
        article = Article(url)
        article.download()
        article.parse()
        title = article.title or None
        paragraphs = [p.strip() for p in article.text.split("\n") if p.strip()]
        md_body = "\n\n".join(paragraphs)
        md_text = f"# {title or url}\n\n{md_body}\n"
    else:
        # Fall back to the BeautifulSoup-based scraper.
        md_body, title = scrape_with_bs(url, remove_images=remove_images)
        md_text = f"# {title or url}\n\n{md_body}\n"
    summarizer = get_summarizer(summarizer_model)
    summary = summarize_text(md_text, summarizer) if summarizer else ""
    if summary:
        header = f"**Summary:**\n\n> {summary}\n\n---\n\n"
    else:
        header = ""
    final_md = f"{header}{md_text}\n---\n"
    html_preview = md.markdown(final_md, extensions=["fenced_code", "tables"])
    return final_md, html_preview


def process_urls(urls_text: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str, str, str]:
    """Scrape every URL (one per line), concatenate the results, and write output.md."""
    urls: List[str] = [u.strip() for u in urls_text.splitlines() if u.strip()]
    if not urls:
        return "Please provide at least one URL.", "", ""
    combined_md_parts = []
    combined_html_parts = []
    for url in tqdm(urls):
        try:
            md_text, html_preview = scrape_single_url(url, remove_images, use_newspaper, summarizer_model)
            combined_md_parts.append(md_text)
            combined_html_parts.append(html_preview)
        except Exception as e:
            combined_md_parts.append(f"# {url}\n\n**Error:** {e}\n---\n")
    combined_md = "\n".join(combined_md_parts).strip()
    combined_html = "\n".join(combined_html_parts).strip()
    out_path = os.path.abspath("output.md")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined_md)
    return combined_md, combined_html, out_path


with gr.Blocks(title="Web → Markdown Scraper") as demo:
    gr.Markdown("# 🌠 Web → Markdown Scraper with Multi-URL and Summarization")
    with gr.Row():
        urls_box = gr.Textbox(label="Enter URLs (one per line)", lines=6,
                              placeholder="https://example.com\nhttps://news.ycombinator.com")
    with gr.Row():
        remove_images = gr.Checkbox(label="Remove Images", value=False)
        use_newspaper = gr.Checkbox(label=f"Extract Main Article (newspaper3k){'' if NEWSPAPER_AVAILABLE else ' [unavailable]'}",
                                    value=USE_NEWSPAPER_DEFAULT and NEWSPAPER_AVAILABLE,
                                    interactive=NEWSPAPER_AVAILABLE)
        model_choice = gr.Dropdown(label="Summarization Model", choices=SUMM_MODELS,
                                   value=SUMM_MODELS[1], allow_custom_value=True)
    run_btn = gr.Button("Scrape & Convert", variant="primary")
    with gr.Row(equal_height=True):
        md_output = gr.Textbox(label="Markdown Output", lines=22)
        html_preview = gr.HTML(label="Preview (rendered)")
    download_file = gr.File(label="Download .md", interactive=False)
    run_btn.click(process_urls,
                  inputs=[urls_box, remove_images, use_newspaper, model_choice],
                  outputs=[md_output, html_preview, download_file])

if __name__ == "__main__":
    demo.launch()
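
Note that app.py imports normalize_url and scrape_with_bs from a local utils module that is not part of this commit. The sketch below shows one plausible shape for that module: the two function names and the remove_images keyword come from the calls in app.py, but the bodies, the requests/BeautifulSoup/markdownify dependencies, and all other details are assumptions, not code from the repository.

# utils.py: hypothetical sketch, not part of this commit.
# Assumes the requests, beautifulsoup4, and markdownify packages are installed.
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify


def normalize_url(url: str) -> str:
    # Prepend a scheme if the user omitted it (e.g. "example.com" -> "https://example.com").
    url = url.strip()
    if not urlparse(url).scheme:
        url = "https://" + url
    return url


def scrape_with_bs(url: str, remove_images: bool = False):
    # Fetch the page, strip non-content tags, and convert the body to Markdown.
    resp = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else None
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    if remove_images:
        for img in soup.find_all("img"):
            img.decompose()
    body = soup.body or soup
    md_body = markdownify(str(body), heading_style="ATX").strip()
    return md_body, title

With some module like this alongside app.py (plus gradio, markdown, tqdm, transformers, and optionally newspaper3k installed), running python app.py should launch the Gradio interface locally.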