Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -53,6 +53,7 @@ class GPTDriveIntegration:
|
|
| 53 |
"""Download and extract text content from file"""
|
| 54 |
try:
|
| 55 |
if 'text' in mime_type or 'document' in mime_type:
|
|
|
|
| 56 |
if 'document' in mime_type:
|
| 57 |
request = self.drive_service.files().export_media(
|
| 58 |
fileId=file_id, mimeType='text/plain'
|
|
@@ -66,7 +67,7 @@ class GPTDriveIntegration:
|
|
| 66 |
while done is False:
|
| 67 |
status, done = downloader.next_chunk()
|
| 68 |
|
| 69 |
-
|
| 70 |
|
| 71 |
elif 'spreadsheet' in mime_type:
|
| 72 |
# For Google Sheets, export as CSV
|
|
@@ -90,10 +91,9 @@ class GPTDriveIntegration:
|
|
| 90 |
while done is False:
|
| 91 |
status, done = downloader.next_chunk()
|
| 92 |
|
| 93 |
-
# Extract text from PDF
|
| 94 |
-
file_content.seek(0)
|
| 95 |
|
| 96 |
-
# Option 1: Using PyPDF2
|
| 97 |
try:
|
| 98 |
import PyPDF2
|
| 99 |
pdf_reader = PyPDF2.PdfReader(file_content)
|
|
@@ -102,35 +102,7 @@ class GPTDriveIntegration:
|
|
| 102 |
text += page.extract_text() + "\n"
|
| 103 |
return text
|
| 104 |
except ImportError:
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
# Option 2: Using pdfplumber (better for complex PDFs)
|
| 108 |
-
try:
|
| 109 |
-
import pdfplumber
|
| 110 |
-
text = ""
|
| 111 |
-
with pdfplumber.open(file_content) as pdf:
|
| 112 |
-
for page in pdf.pages:
|
| 113 |
-
page_text = page.extract_text()
|
| 114 |
-
if page_text:
|
| 115 |
-
text += page_text + "\n"
|
| 116 |
-
return text
|
| 117 |
-
except ImportError:
|
| 118 |
-
pass
|
| 119 |
-
|
| 120 |
-
# Option 3: Using pymupdf (fitz) - fastest option
|
| 121 |
-
try:
|
| 122 |
-
import fitz # pymupdf
|
| 123 |
-
pdf_document = fitz.open(stream=file_content.read(), filetype="pdf")
|
| 124 |
-
text = ""
|
| 125 |
-
for page_num in range(pdf_document.page_count):
|
| 126 |
-
page = pdf_document[page_num]
|
| 127 |
-
text += page.get_text() + "\n"
|
| 128 |
-
pdf_document.close()
|
| 129 |
-
return text
|
| 130 |
-
except ImportError:
|
| 131 |
-
pass
|
| 132 |
-
|
| 133 |
-
return "PDF text extraction requires PyPDF2, pdfplumber, or pymupdf library"
|
| 134 |
|
| 135 |
else:
|
| 136 |
return "File type not supported for text extraction"
|
|
|
|
| 53 |
"""Download and extract text content from file"""
|
| 54 |
try:
|
| 55 |
if 'text' in mime_type or 'document' in mime_type:
|
| 56 |
+
# For Google Docs, export as plain text
|
| 57 |
if 'document' in mime_type:
|
| 58 |
request = self.drive_service.files().export_media(
|
| 59 |
fileId=file_id, mimeType='text/plain'
|
|
|
|
| 67 |
while done is False:
|
| 68 |
status, done = downloader.next_chunk()
|
| 69 |
|
| 70 |
+
return file_content.getvalue().decode('utf-8')
|
| 71 |
|
| 72 |
elif 'spreadsheet' in mime_type:
|
| 73 |
# For Google Sheets, export as CSV
|
|
|
|
| 91 |
while done is False:
|
| 92 |
status, done = downloader.next_chunk()
|
| 93 |
|
| 94 |
+
# Extract text from PDF
|
| 95 |
+
file_content.seek(0)
|
| 96 |
|
|
|
|
| 97 |
try:
|
| 98 |
import PyPDF2
|
| 99 |
pdf_reader = PyPDF2.PdfReader(file_content)
|
|
|
|
| 102 |
text += page.extract_text() + "\n"
|
| 103 |
return text
|
| 104 |
except ImportError:
|
| 105 |
+
return "PDF text extraction requires PyPDF2 library"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
else:
|
| 108 |
return "File type not supported for text extraction"
|