Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
c2b2088
1
Parent(s):
aea4c94
Update title extraction
Browse files
app.py
CHANGED
|
@@ -90,8 +90,15 @@ async def extract_text_pdf(id_doc: str):
|
|
| 90 |
postprocess_text = remove_in_betweens(postprocess_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 93 |
-
titles =
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
else:
|
| 96 |
print("ID: " + id_doc)
|
| 97 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|
|
|
|
| 90 |
postprocess_text = remove_in_betweens(postprocess_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 93 |
+
titles = doc.get_toc()
|
| 94 |
+
main_titles = []
|
| 95 |
+
if len(titles) <= 0:
|
| 96 |
+
main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)
|
| 97 |
+
else:
|
| 98 |
+
for title in titles:
|
| 99 |
+
if title[0] == 1:
|
| 100 |
+
main_titles.append(title[1])
|
| 101 |
+
return {"message": main_titles, "pub_id": id_doc, "error": False}
|
| 102 |
else:
|
| 103 |
print("ID: " + id_doc)
|
| 104 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|