Spaces:
Sleeping
Sleeping
File size: 3,796 Bytes
4ede186 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import PyPDF2
from docx import Document
import docx2txt
import io
from typing import Union
class DocumentProcessor:
    """Extract plain text from PDF, DOCX and legacy DOC documents.

    Every method is a static method, so the class is used as a namespace:
    ``DocumentProcessor.extract_text(file)``. The extractors never raise for
    unreadable input; they print a diagnostic and return a fallback string.
    """

    @staticmethod
    def _rewind(file) -> None:
        """Move *file* back to its start when it supports seeking."""
        # Path strings (accepted by docx2txt) have no ``seek``; skip them.
        if hasattr(file, 'seek'):
            file.seek(0)

    @staticmethod
    def extract_text_from_pdf(file) -> str:
        """Extract text from a PDF file.

        Args:
            file: File object or file-like object handed to
                ``PyPDF2.PdfReader``.

        Returns:
            The text of all pages that yielded text, joined by newlines and
            stripped; an empty string if anything goes wrong.
        """
        try:
            # Ensure we're at the beginning of the file
            DocumentProcessor._rewind(file)
            reader = PyPDF2.PdfReader(file)
            page_texts = (page.extract_text() for page in reader.pages)
            # Pages without extractable text (None or "") are skipped.
            result = "\n".join(chunk for chunk in page_texts if chunk).strip()
            print(f"[DEBUG] Extracted {len(result)} characters from PDF")
            return result
        except Exception as e:
            print(f"Error extracting text from PDF: {str(e)}")
            import traceback
            traceback.print_exc()
            return ""

    @staticmethod
    def extract_text_from_docx(file) -> str:
        """Extract text from a DOCX file.

        python-docx is tried first; if it fails or finds no paragraph text,
        docx2txt is used as a fallback.

        Args:
            file: File object, file-like object, or path.

        Returns:
            The stripped document text, or an empty string if both
            extraction attempts fail.
        """
        try:
            # Try using python-docx first
            try:
                DocumentProcessor._rewind(file)
                doc = Document(file)
                text = "\n".join(
                    paragraph.text for paragraph in doc.paragraphs
                ).strip()
                if text:
                    return text
            except Exception as e:
                # Best effort: report why the first attempt failed, then
                # fall through to docx2txt instead of hiding the error.
                print(
                    "python-docx could not read DOCX, "
                    f"falling back to docx2txt: {str(e)}"
                )
            # Fallback to docx2txt; the first attempt may have consumed
            # the stream, so rewind it when possible.
            DocumentProcessor._rewind(file)
            text = docx2txt.process(file)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from DOCX: {str(e)}")
            return ""

    @staticmethod
    def extract_text_from_doc(file) -> str:
        """Extract text from a DOC file (legacy Word format).

        Args:
            file: File object, file-like object, or path passed to
                ``docx2txt.process``.

        Returns:
            The stripped extracted text. If extraction fails, an advisory
            note recommending conversion to .docx is returned *in place of*
            the text, so callers that need to detect failure must compare
            against that note.
        """
        try:
            DocumentProcessor._rewind(file)
            # docx2txt is the only reader tried here.
            # NOTE(review): it may not handle true binary .doc files - confirm.
            text = docx2txt.process(file)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from DOC: {str(e)}")
            # If docx2txt fails, return a message
            return "Note: Legacy .doc format may require conversion to .docx for better text extraction."

    @staticmethod
    def extract_text(file, file_type: Union[str, None] = None) -> str:
        """Extract text from any supported document format.

        Args:
            file: File object or file-like object.
            file_type: File extension (e.g., '.pdf', 'docx', '.DOC'). Case
                and a leading dot are ignored. When omitted, the type is
                taken from the extension of ``file.name``, then from the
                MIME type in ``file.type``, and finally defaults to PDF.

        Returns:
            Extracted text as a string, or ``"Unsupported file type: ..."``
            when the type is not pdf, docx or doc.
        """
        # Determine file type if not provided
        if file_type is None:
            name = getattr(file, 'name', None)
            if isinstance(name, str):
                file_type = name.split('.')[-1]
            elif hasattr(file, 'type'):
                # ``name`` can be absent or non-string (e.g. an integer
                # file descriptor); fall back to the MIME type.
                type_map = {
                    'application/pdf': 'pdf',
                    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
                    'application/msword': 'doc',
                }
                file_type = type_map.get(file.type, 'pdf')
            else:
                file_type = 'pdf'  # Default to PDF

        # Normalise so '.PDF', 'Pdf' and 'pdf' all select the same extractor.
        file_type = file_type.strip().lstrip('.').lower()

        # Each extractor rewinds the stream itself before reading.
        if file_type == 'pdf':
            return DocumentProcessor.extract_text_from_pdf(file)
        elif file_type == 'docx':
            return DocumentProcessor.extract_text_from_docx(file)
        elif file_type == 'doc':
            return DocumentProcessor.extract_text_from_doc(file)
        else:
            return f"Unsupported file type: {file_type}"
|