Spaces:

NeuML
/

articlesummary

Running

davidmezzetti committed on Nov 15, 2021

Commit

a4aec71

1 Parent(s): 3dd67db

Create textractor.py

Files changed (1) hide show

textractor.py ADDED Viewed

+"""
+Textractor module
+"""
+import requests
+from bs4 import BeautifulSoup
+from txtai.pipeline.segmentation import Segmentation
+class Textractor(Segmentation):
+    """
+    Extracts text from files.
+    """
+    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
+        super().__init__(sentences, lines, paragraphs, minlength, join)
+    def text(self, text):
+        # text is a url
+        response = requests.get(text)
+        html = response.text
+        soup = BeautifulSoup(html, features="html.parser")
+        return soup.get_text()