main.py CHANGED
@@ -66,7 +66,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")
 
         if debug:
-            if len(items) >
+            if len(items) > 500:
                 break
 
     return items
@@ -153,9 +153,7 @@ def parse_html_content(html):
 
     # Extract paper title
     try:
-        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(
-            strip=True
-        )
+        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
     except Exception:
         paper_title = soup.find("title").get_text(strip=True)
         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
@@ -170,9 +168,7 @@ def parse_html_content(html):
     if abstract:
         result.append(
             {
-                "content": " ".join(
-                    p.get_text(strip=True) for p in abstract.find_all("p")
-                ).replace(")", ") "),
+                "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
                 "title": "Abstract",
                 "paper_title": paper_title,
                 "content_type": "abstract",
@@ -182,11 +178,7 @@ def parse_html_content(html):
     sections = soup.find_all("section", class_="ltx_section")
     for index, section in enumerate(sections):
         section_title = section.find("h2", class_="ltx_title ltx_title_section")
-        section_title = (
-            section_title.get_text(strip=True)
-            if section_title
-            else f"Section {index + 1}"
-        )
+        section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
         section_content = section.get_text(strip=True).replace(")", ") ")
 
         content_type = "body"
@@ -281,9 +273,7 @@ def parse_markdown_content(md_content, arxiv_id):
                     "content": " ".join(content),
                     "title": current_title,
                     "paper_title": paper_title,
-                    "content_type": get_content_type(
-                        current_section, len(parsed)
-                    ),
+                    "content_type": get_content_type(current_section, len(parsed)),
                     "arxiv_id": arxiv_id,
                 }
             )
@@ -393,13 +383,7 @@ def create_hf_image_dataset(base_dir):
 
         # Add the data
         data.append(
-            {
-                "image": image_path,
-                "arxiv_id": arxiv_id,
-                "page_number": page_number,
-                "width": width,
-                "height": height,
-            }
+            {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
         )
 
     # Create the dataset
@@ -435,23 +419,24 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     )
 
     # upload image dataset
-    img_ds = create_hf_image_dataset("data/arxiv_images")
-    img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
-
-    # push id_to_abstract
-    abstract_ds = Dataset.from_pandas(abstract_df)
-    abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
-
-    # push arxiv_items
-    arxiv_ds = Dataset.from_pandas(contents_df)
-    arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
-
-    # push processed_arxiv_ids
-    processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
-    processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-    processed_arxiv_ids_ds.push_to_hub(
-        repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN")
-    )
+    try:
+        img_ds = create_hf_image_dataset("data/arxiv_images")
+        img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+
+        # push id_to_abstract
+        abstract_ds = Dataset.from_pandas(abstract_df)
+        abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+
+        # push arxiv_items
+        arxiv_ds = Dataset.from_pandas(contents_df)
+        arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+
+        # push processed_arxiv_ids
+        processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
+        processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
+        processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)
 
 
 ########################################################
@@ -467,9 +452,7 @@ def main():
 
     # get already processed arxiv ids from HF
     try:
-        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"][
-            "arxiv_id"
-        ]
+        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
         try:
@@ -481,9 +464,7 @@ def main():
     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
 
     # new arxiv items
-    arxiv_items = [
-        item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids
-    ]
+    arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
     arxiv_items = fetch_arxiv_htmls(arxiv_items)
     print(f"# of new arxiv items: {len(arxiv_items)}")
 
@@ -521,11 +502,7 @@ def main():
                 id_to_abstract[item["arxiv_id"]] = entry["content"]
                 break
     print(f"# of abstracts: {len(id_to_abstract)}")
-    abstract_df = (
-        pd.Series(id_to_abstract)
-        .reset_index()
-        .rename(columns={"index": "arxiv_id", 0: "abstract"})
-    )
+    abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
    print(abstract_df.head())
 
     # add to existing dataset
@@ -537,9 +514,7 @@ def main():
     print(old_abstract_df.head())
 
     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
-    abstract_df = abstract_df.drop_duplicates(
-        subset=["arxiv_id"], keep="last"
-    ).reset_index(drop=True)
+    abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
 
     # contents
     contents_df = pd.DataFrame(arxiv_items)
@@ -553,9 +528,7 @@ def main():
     print(old_contents_df.sample().T)
 
     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
-    contents_df = contents_df.drop_duplicates(
-        subset=["arxiv_id"], keep="last"
-    ).reset_index(drop=True)
+    contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
 
     # upload to hf
     processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))