Ephraimmm committed on
Commit
15754f2
·
verified ·
1 Parent(s): be5938e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -33
app.py CHANGED
@@ -53,6 +53,7 @@ class GPTDriveIntegration:
53
  """Download and extract text content from file"""
54
  try:
55
  if 'text' in mime_type or 'document' in mime_type:
 
56
  if 'document' in mime_type:
57
  request = self.drive_service.files().export_media(
58
  fileId=file_id, mimeType='text/plain'
@@ -66,7 +67,7 @@ class GPTDriveIntegration:
66
  while done is False:
67
  status, done = downloader.next_chunk()
68
 
69
- return file_content.getvalue().decode('utf-8')
70
 
71
  elif 'spreadsheet' in mime_type:
72
  # For Google Sheets, export as CSV
@@ -90,10 +91,9 @@ class GPTDriveIntegration:
90
  while done is False:
91
  status, done = downloader.next_chunk()
92
 
93
- # Extract text from PDF using PyPDF2 or pdfplumber
94
- file_content.seek(0) # Reset buffer position
95
 
96
- # Option 1: Using PyPDF2
97
  try:
98
  import PyPDF2
99
  pdf_reader = PyPDF2.PdfReader(file_content)
@@ -102,35 +102,7 @@ class GPTDriveIntegration:
102
  text += page.extract_text() + "\n"
103
  return text
104
  except ImportError:
105
- pass
106
-
107
- # Option 2: Using pdfplumber (better for complex PDFs)
108
- try:
109
- import pdfplumber
110
- text = ""
111
- with pdfplumber.open(file_content) as pdf:
112
- for page in pdf.pages:
113
- page_text = page.extract_text()
114
- if page_text:
115
- text += page_text + "\n"
116
- return text
117
- except ImportError:
118
- pass
119
-
120
- # Option 3: Using pymupdf (fitz) - fastest option
121
- try:
122
- import fitz # pymupdf
123
- pdf_document = fitz.open(stream=file_content.read(), filetype="pdf")
124
- text = ""
125
- for page_num in range(pdf_document.page_count):
126
- page = pdf_document[page_num]
127
- text += page.get_text() + "\n"
128
- pdf_document.close()
129
- return text
130
- except ImportError:
131
- pass
132
-
133
- return "PDF text extraction requires PyPDF2, pdfplumber, or pymupdf library"
134
 
135
  else:
136
  return "File type not supported for text extraction"
 
53
  """Download and extract text content from file"""
54
  try:
55
  if 'text' in mime_type or 'document' in mime_type:
56
+ # For Google Docs, export as plain text
57
  if 'document' in mime_type:
58
  request = self.drive_service.files().export_media(
59
  fileId=file_id, mimeType='text/plain'
 
67
  while done is False:
68
  status, done = downloader.next_chunk()
69
 
70
+ return file_content.getvalue().decode('utf-8')
71
 
72
  elif 'spreadsheet' in mime_type:
73
  # For Google Sheets, export as CSV
 
91
  while done is False:
92
  status, done = downloader.next_chunk()
93
 
94
+ # Extract text from PDF
95
+ file_content.seek(0)
96
 
 
97
  try:
98
  import PyPDF2
99
  pdf_reader = PyPDF2.PdfReader(file_content)
 
102
  text += page.extract_text() + "\n"
103
  return text
104
  except ImportError:
105
+ return "PDF text extraction requires PyPDF2 library"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  else:
108
  return "File type not supported for text extraction"