Marthee committed on
Commit
ffc4abe
·
verified ·
1 Parent(s): f57c685

Update pdftotext.py

Browse files
Files changed (1) hide show
  1. pdftotext.py +52 -14
pdftotext.py CHANGED
@@ -5,19 +5,57 @@ import tsadropboxretrieval
5
 
6
  def texts_from_pdf(dbpdfpath):
7
  print('intexts')
8
- dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')
9
- print('dbdone')
10
- md, res =dbxTeam.files_download(path=dbpdfpath)
11
- print('downloaded')
12
- dataDoc = res.content
13
- print('l')
14
- pdf_document = fitz.open('pdf',dataDoc)
15
- print('k')
16
- alltexts=''
17
- for page_num in range(pdf_document.page_count):
18
- page = pdf_document[page_num]
19
- text_instances = page.get_text()
20
- alltexts+=text_instances
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- # alltexts = alltexts.replace('\n', ' ')
 
 
 
 
 
 
 
 
 
23
  return alltexts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def texts_from_pdf(dbpdfpath):
7
  print('intexts')
8
+
9
+ pdf_content = None
10
+
11
+ # Case 1: If it's a shareable link
12
+ if pdfshareablelink and ('http' in pdfshareablelink or 'dropbox' in pdfshareablelink):
13
+ # Modify Dropbox link for direct download
14
+ if 'dl=0' in pdfshareablelink:
15
+ pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')
16
+
17
+ # Download the PDF content from the shareable link
18
+ response = requests.get(pdfshareablelink)
19
+ pdf_content = BytesIO(response.content) # Store the content in memory
20
+ print('Downloaded from shareable link.')
21
+
22
+ # Case 2: If it's a Dropbox path, use the Dropbox API to download
23
+ elif dbpdfpath:
24
+ dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
25
+ print('Dropbox team access initialized.')
26
+ md, res = dbxTeam.files_download(path=dbpdfpath)
27
+ pdf_content = BytesIO(res.content) # Store the content in memory
28
+ print('Downloaded from Dropbox path.')
29
+
30
+ # Check if the PDF content is available
31
+ if pdf_content is None:
32
+ raise ValueError("No valid PDF content found.")
33
 
34
+ # Open the PDF using fitz (PyMuPDF) directly from memory
35
+ pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
36
+ print('PDF opened in memory.')
37
+
38
+ alltexts = ''
39
+ for page_num in range(pdf_document.page_count):
40
+ page = pdf_document.load_page(page_num)
41
+ text_instances = page.get_text() # Extract text from each page
42
+ alltexts += text_instances
43
+
44
  return alltexts
45
+ # print('intexts')
46
+ # dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')
47
+ # print('dbdone')
48
+ # md, res =dbxTeam.files_download(path=dbpdfpath)
49
+ # print('downloaded')
50
+ # dataDoc = res.content
51
+ # print('l')
52
+ # pdf_document = fitz.open('pdf',dataDoc)
53
+ # print('k')
54
+ # alltexts=''
55
+ # for page_num in range(pdf_document.page_count):
56
+ # page = pdf_document[page_num]
57
+ # text_instances = page.get_text()
58
+ # alltexts+=text_instances
59
+
60
+ # # alltexts = alltexts.replace('\n', ' ')
61
+ # return alltexts