ShayanRl committed on
Commit
c45f030
·
verified ·
1 Parent(s): a673cfb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -9
app.py CHANGED
@@ -6,16 +6,41 @@ import requests
6
  import pdfplumber
7
 
8
  def fextractURL(pdf_path):
9
- r = requests.get(pdf_path)
10
- f = io.BytesIO(r.content)
11
  extracted_data = ""
12
- with pdfplumber.open(f) as pdf:
13
- for page in pdf.pages:
14
- extracted_data += page.extract_text() + "\n" # Extract text
15
- tables = page.extract_tables() # Extract tables
16
- for table in tables:
17
- for row in table:
18
- extracted_data += "\t".join(str(cell) for cell in row) + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  return extracted_data
20
 
21
 
 
6
  import pdfplumber
7
 
8
  def fextractURL(pdf_path):
 
 
9
  extracted_data = ""
10
+
11
+ if pdf_path.endswith('.pdf'):
12
+ # If the URL ends with .pdf, use pdfplumber directly
13
+ r = requests.get(pdf_path)
14
+ f = io.BytesIO(r.content)
15
+ with pdfplumber.open(f) as pdf:
16
+ for page in pdf.pages:
17
+ extracted_data += page.extract_text() + "\n" # Extract text
18
+ tables = page.extract_tables() # Extract tables
19
+ for table in tables:
20
+ for row in table:
21
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
22
+ else:
23
+ # If the URL does not end with .pdf, download the PDF first
24
+ response = requests.get(pdf_path)
25
+ pdf_content = response.content
26
+
27
+ # Save the PDF locally
28
+ pdf_filename = 'downloaded_document.pdf'
29
+ with open(pdf_filename, 'wb') as pdf_file:
30
+ pdf_file.write(pdf_content)
31
+
32
+ # Extract content using pdfplumber
33
+ with pdfplumber.open(pdf_filename) as pdf:
34
+ for page in pdf.pages:
35
+ extracted_data += page.extract_text() + "\n" # Extract text
36
+ tables = page.extract_tables() # Extract tables
37
+ for table in tables:
38
+ for row in table:
39
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
40
+
41
+ # Delete the PDF file
42
+ os.remove(pdf_filename)
43
+
44
  return extracted_data
45
 
46