pdf2AIextract

Sleeping

ShayanRl committed on Jul 17, 2024

Commit

c45f030

verified ·

1 Parent(s): a673cfb

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,16 +6,41 @@ import requests
 import pdfplumber
 def fextractURL(pdf_path):
-    r = requests.get(pdf_path)
-    f = io.BytesIO(r.content)
     extracted_data = ""
-    with pdfplumber.open(f) as pdf:
-        for page in pdf.pages:
-            extracted_data += page.extract_text() + "\n"  # Extract text
-            tables = page.extract_tables()  # Extract tables
-            for table in tables:
-                for row in table:
-                    extracted_data += "\t".join(str(cell) for cell in row) + "\n"
     return extracted_data

 import pdfplumber
 def fextractURL(pdf_path):
     extracted_data = ""
+    if pdf_path.endswith('.pdf'):
+        # If the URL ends with .pdf, use pdfplumber directly
+        r = requests.get(pdf_path)
+        f = io.BytesIO(r.content)
+        with pdfplumber.open(f) as pdf:
+            for page in pdf.pages:
+                extracted_data += page.extract_text() + "\n"  # Extract text
+                tables = page.extract_tables()  # Extract tables
+                for table in tables:
+                    for row in table:
+                        extracted_data += "\t".join(str(cell) for cell in row) + "\n"
+    else:
+        # If the URL does not end with .pdf, download the PDF first
+        response = requests.get(pdf_path)
+        pdf_content = response.content
+        # Save the PDF locally
+        pdf_filename = 'downloaded_document.pdf'
+        with open(pdf_filename, 'wb') as pdf_file:
+            pdf_file.write(pdf_content)
+        # Extract content using pdfplumber
+        with pdfplumber.open(pdf_filename) as pdf:
+            for page in pdf.pages:
+                extracted_data += page.extract_text() + "\n"  # Extract text
+                tables = page.extract_tables()  # Extract tables
+                for table in tables:
+                    for row in table:
+                        extracted_data += "\t".join(str(cell) for cell in row) + "\n"
+        # Delete the PDF file
+        os.remove(pdf_filename)
     return extracted_data