Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,7 +18,10 @@ except ImportError:
|
|
| 18 |
except ImportError:
|
| 19 |
try:
|
| 20 |
import trafilatura
|
|
|
|
| 21 |
EXTRACTOR_NET = 'trafilatura'
|
|
|
|
|
|
|
| 22 |
except ImportError:
|
| 23 |
raise ImportError
|
| 24 |
|
|
@@ -301,7 +304,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
|
|
| 301 |
extracted = extract_content(requests.get(url).content)
|
| 302 |
input_batch_content.append(extracted)
|
| 303 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
| 304 |
-
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False)
|
| 305 |
input_batch_content.append(extracted)
|
| 306 |
else:
|
| 307 |
print("[i] Data is news contents")
|
|
|
|
| 18 |
except ImportError:
|
| 19 |
try:
|
| 20 |
import trafilatura
|
| 21 |
+
from trafilatura.settings import use_config
|
| 22 |
EXTRACTOR_NET = 'trafilatura'
|
| 23 |
+
trafilatura_config = use_config()
|
| 24 |
+
trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0") # Avoid running signals, which would clash with gradio threads
|
| 25 |
except ImportError:
|
| 26 |
raise ImportError
|
| 27 |
|
|
|
|
| 304 |
extracted = extract_content(requests.get(url).content)
|
| 305 |
input_batch_content.append(extracted)
|
| 306 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
| 307 |
+
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
|
| 308 |
input_batch_content.append(extracted)
|
| 309 |
else:
|
| 310 |
print("[i] Data is news contents")
|