Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -306,7 +306,16 @@ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies
|
|
| 306 |
extracted = extract_content(requests.get(url).content)
|
| 307 |
input_batch_content.append(extracted)
|
| 308 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
input_batch_content.append(extracted)
|
| 311 |
else:
|
| 312 |
print("[i] Data is news contents")
|
|
@@ -349,7 +358,7 @@ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies
|
|
| 349 |
if ner_labels[idx]: #not empty
|
| 350 |
for ner in ner_labels[idx]:
|
| 351 |
if filt_companies_topic:
|
| 352 |
-
if news_sectors[idx]
|
| 353 |
continue
|
| 354 |
dfo = pd.concat( [dfo, df.loc[[idx]].assign(company=ner[0], sector=ner[1], symbol=ner[2])], join='outer', ignore_index=True) #axis=0
|
| 355 |
print("[i] Pandas output shape:",dfo.shape)
|
|
|
|
| 306 |
extracted = extract_content(requests.get(url).content)
|
| 307 |
input_batch_content.append(extracted)
|
| 308 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
| 309 |
+
try:
|
| 310 |
+
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
|
| 311 |
+
except:
|
| 312 |
+
archive = is_in_archive(url)
|
| 313 |
+
if archive['archived']:
|
| 314 |
+
print("[W] Using archive.org version of",url)
|
| 315 |
+
url = archive['url']
|
| 316 |
+
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
|
| 317 |
+
else:
|
| 318 |
+
print("[E] URL=",url,"not found")
|
| 319 |
input_batch_content.append(extracted)
|
| 320 |
else:
|
| 321 |
print("[i] Data is news contents")
|
|
|
|
| 358 |
if ner_labels[idx]: #not empty
|
| 359 |
for ner in ner_labels[idx]:
|
| 360 |
if filt_companies_topic:
|
| 361 |
+
if news_sectors[idx][0] not in ner[1]:
|
| 362 |
continue
|
| 363 |
dfo = pd.concat( [dfo, df.loc[[idx]].assign(company=ner[0], sector=ner[1], symbol=ner[2])], join='outer', ignore_index=True) #axis=0
|
| 364 |
print("[i] Pandas output shape:",dfo.shape)
|