Spaces:

orgoflu
/

moro_text

Sleeping

orgoflu committed on Sep 10, 2025

Commit

f03ed3b

verified ·

1 Parent(s): ddaa72b

app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,7 +7,14 @@ def extract(url):
     try:
         r = requests.get(url, headers=headers, timeout=10)
         r.raise_for_status()
-        text = trafilatura.extract(r.text)
         return text or "본문을 추출할 수 없습니다."
     except requests.exceptions.Timeout:
         return "요청이 시간 초과되었습니다."
@@ -19,9 +26,9 @@ def extract(url):
 iface = gr.Interface(
     fn=extract,
     inputs=gr.Textbox(label="URL 입력", placeholder="https://example.com"),
-    outputs=gr.Textbox(label="추출된 본문", lines=20),
     title="본문 추출기",
-    description="웹페이지 URL을 입력하면 본문만 추출합니다."
 )
 if __name__ == "__main__":

     try:
         r = requests.get(url, headers=headers, timeout=10)
         r.raise_for_status()
+        # full_text=True → 가능한 모든 텍스트 추출
+        text = trafilatura.extract(
+            r.text,
+            include_comments=False,
+            include_tables=True,
+            no_fallback=False,
+            favor_recall=True
+        )
         return text or "본문을 추출할 수 없습니다."
     except requests.exceptions.Timeout:
         return "요청이 시간 초과되었습니다."
 iface = gr.Interface(
     fn=extract,
     inputs=gr.Textbox(label="URL 입력", placeholder="https://example.com"),
+    outputs=gr.Textbox(label="추출된 본문", lines=30),
     title="본문 추출기",
+    description="웹페이지 URL을 입력하면 가능한 많은 본문을 추출합니다."
 )
 if __name__ == "__main__":