Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Jobey1 committed on Feb 26, 2025

Commit

14cda2c

verified ·

1 Parent(s): 9bea774

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -3

app.py CHANGED Viewed

@@ -7,6 +7,18 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
 def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
@@ -119,6 +131,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
     return {
         "filename": os.path.basename(pdf_path),
         "content": content
     }
@@ -171,7 +184,18 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
     print("🟡 Converting Processed Data to Parquet")
     # ✅ Step 2: Convert to Parquet
     df = pd.DataFrame(all_data)
-    parquet_file = 'fully_labeled_papers.parquet'
     try:
         df.to_parquet(parquet_file, engine='pyarrow', index=False)
@@ -248,5 +272,3 @@ with gr.Blocks() as demo:
     )
 demo.launch()

 from huggingface_hub.utils import HfHubHTTPError
 import time
def sanitize_title(title, max_length=100, fallback="untitled"):
    """
    Turn a paper title into a string that is safe to use as a file name stem.

    Every character that is not a word character (letters, digits,
    underscore), whitespace, or a hyphen is removed. Each remaining run of
    whitespace and/or hyphens is then collapsed into a single underscore, so
    hyphens do NOT survive in the result. The result is cut to at most
    ``max_length`` characters.

    Args:
        title: Raw title text. ``None`` is treated like an empty title.
        max_length: Maximum number of characters kept from the sanitized title.
        fallback: Value returned when nothing usable is left after
            sanitizing (e.g. the title was empty or only punctuation). It is
            returned as-is and is not truncated.

    Returns:
        The sanitized title, or ``fallback`` if the sanitized title is empty.
    """
    # Drop everything except word characters, whitespace and hyphens.
    cleaned = re.sub(r'[^\w\s-]', '', title or '').strip()
    # Collapse separators so the name contains no spaces or hyphens.
    cleaned = re.sub(r'[-\s]+', '_', cleaned)
    if len(cleaned) > max_length:
        # A cut right after a separator would yield names like "foo_.parquet".
        cleaned = cleaned[:max_length].rstrip('_')
    # An empty stem would make the caller write a hidden ".parquet" file.
    return cleaned or fallback
 def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
     return {
         "filename": os.path.basename(pdf_path),
+        "title": title,  # Include the title in the return data
         "content": content
     }
     print("🟡 Converting Processed Data to Parquet")
     # ✅ Step 2: Convert to Parquet
     df = pd.DataFrame(all_data)
+    # Generate the parquet file name
+    if len(all_data) == 1:
+        paper_title = all_data[0].get("title", "").strip()
+        if paper_title:
+            safe_title = sanitize_title(paper_title)
+            parquet_file = f"{safe_title}.parquet"
+        else:
+            parquet_file = 'fully_labeled_papers.parquet'
+    else:
+        # For multiple PDFs, include a timestamp to avoid overwrites
+        parquet_file = f"fully_labeled_papers_{time.strftime('%Y%m%d_%H%M%S')}.parquet"
     try:
         df.to_parquet(parquet_file, engine='pyarrow', index=False)
     )
 demo.launch()