Spaces:
Sleeping
Sleeping
refresh
Browse files- .gitignore +2 -1
- app.py +17 -8
- main.py +28 -9
.gitignore
CHANGED
|
@@ -2,4 +2,5 @@
|
|
| 2 |
*.json
|
| 3 |
data
|
| 4 |
.ipynb_checkpoints
|
| 5 |
-
__pycache__
|
|
|
|
|
|
| 2 |
*.json
|
| 3 |
data
|
| 4 |
.ipynb_checkpoints
|
| 5 |
+
__pycache__
|
| 6 |
+
.sesskey
|
app.py
CHANGED
|
@@ -27,14 +27,13 @@ app, rt = fast_app(html_style=(style,))
|
|
| 27 |
login(token=os.environ.get("HF_TOKEN"))
|
| 28 |
|
| 29 |
hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
|
| 30 |
-
HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-
|
| 31 |
-
HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-
|
| 32 |
|
| 33 |
abstract_ds = load_dataset(HF_REPO_ID_TXT, "abstracts", split="train")
|
| 34 |
article_ds = load_dataset(HF_REPO_ID_TXT, "articles", split="train")
|
| 35 |
|
| 36 |
-
image_ds = load_dataset(HF_REPO_ID_IMG, "
|
| 37 |
-
image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
|
| 38 |
|
| 39 |
|
| 40 |
def parse_date(date_string):
|
|
@@ -56,11 +55,21 @@ for article in article_ds:
|
|
| 56 |
|
| 57 |
weeks = sorted(week2articles.keys(), reverse=True)
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
def get_article_details(arxiv_id):
|
| 61 |
-
article =
|
| 62 |
-
abstract =
|
| 63 |
-
image =
|
| 64 |
return article, abstract, image
|
| 65 |
|
| 66 |
|
|
@@ -103,7 +112,7 @@ def generate_week_content(current_week):
|
|
| 103 |
]
|
| 104 |
|
| 105 |
if image:
|
| 106 |
-
pil_image = image[0]["image"]
|
| 107 |
img_byte_arr = BytesIO()
|
| 108 |
pil_image.save(img_byte_arr, format="JPEG")
|
| 109 |
img_byte_arr = img_byte_arr.getvalue()
|
|
|
|
| 27 |
login(token=os.environ.get("HF_TOKEN"))
|
| 28 |
|
| 29 |
hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
|
| 30 |
+
HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
|
| 31 |
+
HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
|
| 32 |
|
| 33 |
abstract_ds = load_dataset(HF_REPO_ID_TXT, "abstracts", split="train")
|
| 34 |
article_ds = load_dataset(HF_REPO_ID_TXT, "articles", split="train")
|
| 35 |
|
| 36 |
+
image_ds = load_dataset(HF_REPO_ID_IMG, "images_first_page", split="train")
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def parse_date(date_string):
|
|
|
|
| 55 |
|
| 56 |
weeks = sorted(week2articles.keys(), reverse=True)
|
| 57 |
|
| 58 |
+
arxiv2article = {article["arxiv_id"]: article for article in article_ds}
|
| 59 |
+
arxiv2abstract = {abstract["arxiv_id"]: abstract for abstract in abstract_ds}
|
| 60 |
+
arxiv2image = {image["arxiv_id"]: image for image in image_ds}
|
| 61 |
+
|
| 62 |
+
# def get_article_details(arxiv_id):
|
| 63 |
+
# article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
|
| 64 |
+
# abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
|
| 65 |
+
# image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
|
| 66 |
+
# return article, abstract, image
|
| 67 |
+
|
| 68 |
|
| 69 |
def get_article_details(arxiv_id):
|
| 70 |
+
article = arxiv2article.get(arxiv_id, {})
|
| 71 |
+
abstract = arxiv2abstract.get(arxiv_id, {})
|
| 72 |
+
image = arxiv2image.get(arxiv_id, {})
|
| 73 |
return article, abstract, image
|
| 74 |
|
| 75 |
|
|
|
|
| 112 |
]
|
| 113 |
|
| 114 |
if image:
|
| 115 |
+
pil_image = image["image"] # image[0]["image"]
|
| 116 |
img_byte_arr = BytesIO()
|
| 117 |
pil_image.save(img_byte_arr, format="JPEG")
|
| 118 |
img_byte_arr = img_byte_arr.getvalue()
|
main.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
|
|
|
| 3 |
import time
|
| 4 |
|
| 5 |
import dotenv
|
|
@@ -19,8 +20,8 @@ dotenv.load_dotenv()
|
|
| 19 |
login(token=os.environ.get("HF_TOKEN"))
|
| 20 |
|
| 21 |
hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
|
| 22 |
-
HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-
|
| 23 |
-
HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-
|
| 24 |
|
| 25 |
|
| 26 |
########################################################
|
|
@@ -67,7 +68,7 @@ def get_zotero_items(debug=False):
|
|
| 67 |
print(f"# items fetched {len(items)}")
|
| 68 |
|
| 69 |
if debug:
|
| 70 |
-
if len(items) >
|
| 71 |
break
|
| 72 |
|
| 73 |
return items
|
|
@@ -334,7 +335,7 @@ def download_arxiv_pdf(arxiv_id):
|
|
| 334 |
raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
|
| 335 |
|
| 336 |
|
| 337 |
-
def pdf_to_jpegs(pdf_content, output_folder):
|
| 338 |
# Create output folder if it doesn't exist
|
| 339 |
os.makedirs(output_folder, exist_ok=True)
|
| 340 |
|
|
@@ -353,6 +354,9 @@ def pdf_to_jpegs(pdf_content, output_folder):
|
|
| 353 |
pix.save(image_path)
|
| 354 |
# print(f"Saved {image_path}")
|
| 355 |
|
|
|
|
|
|
|
|
|
|
| 356 |
doc.close()
|
| 357 |
|
| 358 |
|
|
@@ -444,6 +448,13 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
|
|
| 444 |
except Exception as e:
|
| 445 |
print(e)
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
try:
|
| 448 |
# push id_to_abstract
|
| 449 |
abstract_ds = Dataset.from_pandas(abstract_df)
|
|
@@ -479,11 +490,8 @@ def main():
|
|
| 479 |
existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
|
| 480 |
except Exception as e:
|
| 481 |
print(e)
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
except Exception as e:
|
| 485 |
-
print(e)
|
| 486 |
-
existing_arxiv_ids = []
|
| 487 |
existing_arxiv_ids = set(existing_arxiv_ids)
|
| 488 |
print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
|
| 489 |
|
|
@@ -492,9 +500,20 @@ def main():
|
|
| 492 |
arxiv_items = fetch_arxiv_htmls(arxiv_items)
|
| 493 |
print(f"# of new arxiv items: {len(arxiv_items)}")
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
processed_arxiv_ids = set()
|
| 496 |
pbar = tqdm(range(len(arxiv_items)))
|
| 497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
for item in arxiv_items:
|
| 499 |
# download images --
|
| 500 |
save_arxiv_article_images(item["arxiv_id"])
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
+
import shutil
|
| 4 |
import time
|
| 5 |
|
| 6 |
import dotenv
|
|
|
|
| 20 |
login(token=os.environ.get("HF_TOKEN"))
|
| 21 |
|
| 22 |
hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
|
| 23 |
+
HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
|
| 24 |
+
HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
|
| 25 |
|
| 26 |
|
| 27 |
########################################################
|
|
|
|
| 68 |
print(f"# items fetched {len(items)}")
|
| 69 |
|
| 70 |
if debug:
|
| 71 |
+
if len(items) > 1600:
|
| 72 |
break
|
| 73 |
|
| 74 |
return items
|
|
|
|
| 335 |
raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
|
| 336 |
|
| 337 |
|
| 338 |
+
def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
|
| 339 |
# Create output folder if it doesn't exist
|
| 340 |
os.makedirs(output_folder, exist_ok=True)
|
| 341 |
|
|
|
|
| 354 |
pix.save(image_path)
|
| 355 |
# print(f"Saved {image_path}")
|
| 356 |
|
| 357 |
+
if page_num >= max_pages:
|
| 358 |
+
break
|
| 359 |
+
|
| 360 |
doc.close()
|
| 361 |
|
| 362 |
|
|
|
|
| 448 |
except Exception as e:
|
| 449 |
print(e)
|
| 450 |
|
| 451 |
+
# upload first pages only
|
| 452 |
+
try:
|
| 453 |
+
img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
|
| 454 |
+
img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
|
| 455 |
+
except Exception as e:
|
| 456 |
+
print(e)
|
| 457 |
+
|
| 458 |
try:
|
| 459 |
# push id_to_abstract
|
| 460 |
abstract_ds = Dataset.from_pandas(abstract_df)
|
|
|
|
| 490 |
existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
|
| 491 |
except Exception as e:
|
| 492 |
print(e)
|
| 493 |
+
existing_arxiv_ids = []
|
| 494 |
+
|
|
|
|
|
|
|
|
|
|
| 495 |
existing_arxiv_ids = set(existing_arxiv_ids)
|
| 496 |
print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
|
| 497 |
|
|
|
|
| 500 |
arxiv_items = fetch_arxiv_htmls(arxiv_items)
|
| 501 |
print(f"# of new arxiv items: {len(arxiv_items)}")
|
| 502 |
|
| 503 |
+
if len(arxiv_items) == 0:
|
| 504 |
+
print("No new arxiv items to process")
|
| 505 |
+
return
|
| 506 |
+
|
| 507 |
processed_arxiv_ids = set()
|
| 508 |
pbar = tqdm(range(len(arxiv_items)))
|
| 509 |
|
| 510 |
+
# remove "data" directory if it exists
|
| 511 |
+
if os.path.exists("data"):
|
| 512 |
+
try:
|
| 513 |
+
shutil.rmtree("data")
|
| 514 |
+
except Exception as e:
|
| 515 |
+
print(e)
|
| 516 |
+
|
| 517 |
for item in arxiv_items:
|
| 518 |
# download images --
|
| 519 |
save_arxiv_article_images(item["arxiv_id"])
|