Payam75 committed on
Commit
e600655
·
verified ·
1 Parent(s): 760325d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -95
app.py CHANGED
@@ -1,104 +1,34 @@
1
- import os
2
- from typing import List, Tuple, Optional
3
  import gradio as gr
4
- import markdown2
5
- from tqdm import tqdm
6
- from utils import normalize_url, scrape_with_bs
7
 
8
# newspaper3k is an optional dependency: when it imports cleanly the UI can
# offer main-article extraction; otherwise the feature is flagged unavailable
# and the corresponding checkbox is disabled.
USE_NEWSPAPER_DEFAULT = False
try:
    from newspaper import Article
    NEWSPAPER_AVAILABLE = True
except Exception:
    # Broad catch is deliberate: any import-time failure (missing package,
    # broken native deps) just downgrades the feature instead of crashing.
    NEWSPAPER_AVAILABLE = False
15
 
16
# Transformers summarization: dropdown choices (the first entry disables
# summarization entirely) and a process-wide cache so each HF pipeline is
# loaded at most once per model name.
SUMM_MODELS = ["None (no summarization)", "sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn"]
_pipeline_cache = {}
19
 
20
def get_summarizer(model_name: str):
    """Return a (cached) HF summarization pipeline for *model_name*.

    Returns None when summarization is disabled (sentinel choice or None).
    The transformers import is deferred so the app starts without it when
    summarization is never used.
    """
    if model_name is None or model_name == "None (no summarization)":
        return None
    cached = _pipeline_cache.get(model_name)
    if cached is not None:
        return cached
    from transformers import pipeline
    pipe = pipeline("summarization", model=model_name)
    _pipeline_cache[model_name] = pipe
    return pipe
29
 
30
def summarize_text(text: str, summarizer, max_chars=6000) -> str:
    """Summarize *text* with *summarizer*, bounding the input to *max_chars*.

    Returns "" when no summarizer is supplied; inputs shorter than 25 words
    are returned as-is (too short to be worth summarizing).
    """
    if not summarizer:
        return ""
    # Strip then cap length — slicing a shorter string is a no-op, so this
    # matches the explicit length check it replaces.
    snippet = text.strip()[:max_chars]
    if len(snippet.split()) < 25:
        return snippet
    result = summarizer(snippet, max_length=180, min_length=60, do_sample=False, truncation=True)
    return result[0]["summary_text"].strip()
40
 
41
def scrape_single_url(url: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str, str]:
    """Scrape one URL and return (markdown_text, html_preview).

    Uses newspaper3k main-article extraction when requested and available,
    falling back to the BeautifulSoup scraper otherwise. An optional summary
    block is prepended when a summarization model is selected.

    Raises: whatever the underlying download/parse helpers raise — the caller
    (process_urls) converts failures into per-URL error entries.
    """
    url = normalize_url(url)
    if use_newspaper and NEWSPAPER_AVAILABLE:
        article = Article(url)
        article.download()
        article.parse()
        title = article.title or None
        paragraphs = [p.strip() for p in article.text.split("\n") if p.strip()]
        md_body = "\n\n".join(paragraphs)
    else:
        md_body, title = scrape_with_bs(url, remove_images=remove_images)
    md_text = f"# {title or url}\n\n{md_body}\n"

    summarizer = get_summarizer(summarizer_model)
    summary = summarize_text(md_text, summarizer) if summarizer else ""
    header = f"**Summary:**\n\n> {summary}\n\n---\n\n" if summary else ""
    final_md = f"{header}{md_text}\n---\n"

    # BUG FIX: the original called md.markdown(...) but no `md` name exists —
    # the module imports `markdown2`. Also, `extensions=` is the `markdown`
    # package's kwarg; markdown2 takes `extras` with dash-named options.
    html_preview = markdown2.markdown(final_md, extras=["fenced-code-blocks", "tables"])
    return final_md, html_preview
65
 
66
def process_urls(urls_text: str, remove_images: bool, use_newspaper: bool, summarizer_model: str) -> Tuple[str, str, str]:
    """Scrape every non-blank line of *urls_text*.

    Returns (combined_markdown, combined_html, output_file_path); when no
    URLs are given, returns a human-readable message and empty strings.
    Side effect: writes the combined Markdown to ./output.md.
    """
    urls: List[str] = [u.strip() for u in urls_text.splitlines() if u.strip()]
    if not urls:
        return "Please provide at least one URL.", "", ""
    combined_md_parts = []
    combined_html_parts = []
    for url in tqdm(urls):
        try:
            md_text, html_preview = scrape_single_url(url, remove_images, use_newspaper, summarizer_model)
            combined_md_parts.append(md_text)
            combined_html_parts.append(html_preview)
        except Exception as exc:
            from html import escape  # local import keeps the module header unchanged
            combined_md_parts.append(f"# {url}\n\n**Error:** {exc}\n---\n")
            # BUG FIX: failed URLs used to be silently dropped from the HTML
            # preview, desynchronizing it from the Markdown pane. Error text
            # is escaped since it may echo untrusted URL/content fragments.
            combined_html_parts.append(
                f"<h1>{escape(url)}</h1><p><strong>Error:</strong> {escape(str(exc))}</p><hr/>"
            )
    combined_md = "\n".join(combined_md_parts).strip()
    combined_html = "\n".join(combined_html_parts).strip()
    out_path = os.path.abspath("output.md")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(combined_md)
    return combined_md, combined_html, out_path
85
 
86
# Assemble the Gradio UI: URL list input, option row, action button,
# side-by-side Markdown/HTML outputs, and a download slot.
with gr.Blocks(title="Web → Markdown Scraper") as demo:
    gr.Markdown("# 🌐 Web → Markdown Scraper with Multi-URL and Summarization")

    with gr.Row():
        url_input = gr.Textbox(
            label="Enter URLs (one per line)",
            lines=6,
            placeholder="https://example.com\nhttps://news.ycombinator.com",
        )

    # Same label text as before, built imperatively instead of via an f-string.
    newspaper_label = "Extract Main Article (newspaper3k)"
    if not NEWSPAPER_AVAILABLE:
        newspaper_label += " [unavailable]"

    with gr.Row():
        strip_images_cb = gr.Checkbox(label="Remove Images", value=False)
        newspaper_cb = gr.Checkbox(
            label=newspaper_label,
            value=USE_NEWSPAPER_DEFAULT and NEWSPAPER_AVAILABLE,
            interactive=NEWSPAPER_AVAILABLE,
        )
        model_dd = gr.Dropdown(
            label="Summarization Model",
            choices=SUMM_MODELS,
            value=SUMM_MODELS[1],
            allow_custom_value=True,
        )

    convert_btn = gr.Button("Scrape & Convert", variant="primary")

    with gr.Row(equal_height=True):
        markdown_box = gr.Textbox(label="Markdown Output", lines=22)
        preview_pane = gr.HTML(label="Preview (rendered)")
    md_file = gr.File(label="Download .md", interactive=False)

    convert_btn.click(
        process_urls,
        inputs=[url_input, strip_images_cb, newspaper_cb, model_dd],
        outputs=[markdown_box, preview_pane, md_file],
    )

if __name__ == "__main__":
    demo.launch()
 
 
 
1
  import gradio as gr
2
+ from scraper import scrape_to_markdown
 
 
3
 
4
def process_url(url: str):
    """Convert *url* to Markdown via scrape_to_markdown.

    Returns (markdown_text, html_preview, file_text); on any failure the
    first element is an error message, the preview is a fixed error snippet,
    and the file text is empty.
    """
    try:
        markdown_text, preview_html = scrape_to_markdown(url)
    except Exception as exc:
        return f"❌ Error: {str(exc)}", "<p style='color:red'>Error occurred</p>", ""
    return markdown_text, preview_html, markdown_text
 
10
 
11
# Assemble the Gradio UI: single-URL input, fetch button, Markdown/HTML
# output panes, and a Markdown download slot.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌐 Web2Markdown Tool\nConvert any webpage into clean Markdown")

    with gr.Row():
        url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", scale=3)
        fetch_btn = gr.Button("🚀 Fetch", scale=1)

    with gr.Row():
        md_output = gr.Code(label="Markdown Output", language="markdown")
        html_output = gr.HTML(label="Preview (HTML)")

    download_file = gr.File(label="Download Markdown (.md)", file_types=[".md"], interactive=False)

    def handle_process(url):
        """Run the scrape and persist the Markdown for the download widget.

        Returns (markdown_text, html_preview, file_path_or_None).
        BUG FIX: previously an empty output.md was written and offered for
        download even when scraping failed; now the download slot is left
        empty (None) on error.
        """
        md_text, html_preview, md_file = process_url(url)
        if not md_file:
            return md_text, html_preview, None
        filename = "output.md"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(md_file)
        return md_text, html_preview, filename

    fetch_btn.click(handle_process, inputs=url_input, outputs=[md_output, html_output, download_file])

if __name__ == "__main__":
    demo.launch()