import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md

def extract(url):
    """Fetch a web page and return its main content as Markdown.

    The page is downloaded with ``requests``, the main content is pulled out
    as HTML by ``trafilatura.extract`` (tables included, recall favored), and
    that HTML is converted to Markdown with ATX-style (``#``) headings.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.

    Returns:
        The extracted content as a Markdown string, or a human-readable
        (Korean) message describing why nothing could be extracted. Errors
        are reported through the return value, never raised.
    """
    # Reject blank / non-string input before touching the network so the
    # user gets a clear prompt instead of a raw library error message.
    url = url.strip() if isinstance(url, str) else ""
    if not url:
        return "URL을 입력해 주세요."

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()

        # When the server does not declare a charset, requests decodes
        # text/* bodies as ISO-8859-1, which garbles non-Latin pages.
        # Fall back to the encoding detected from the body itself.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding

        # Extract the main content as HTML.
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )
        if not html_content:
            return "본문을 추출할 수 없습니다."

        # Convert HTML to Markdown.
        return md(html_content, heading_style="ATX")
    except requests.exceptions.Timeout:
        return "요청이 시간 초과되었습니다."
    except requests.exceptions.RequestException as e:
        return f"요청 실패: {e}"
    except Exception as e:
        # UI boundary: show any unexpected failure to the user as text
        # instead of letting the exception propagate.
        return f"에러 발생: {e}"

# Build the input and output components first, then wire them to `extract`.
_url_box = gr.Textbox(label="URL 입력", placeholder="https://example.com")
_markdown_view = gr.Markdown(label="추출된 본문")

# Single-function UI: one URL textbox in, rendered Markdown out.
iface = gr.Interface(
    fn=extract,
    inputs=_url_box,
    outputs=_markdown_view,
    title="본문 추출기 (마크다운 지원)",
    description="웹페이지 URL을 입력하면 리더모드처럼 깔끔하게 마크다운으로 추출합니다.",
)

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()