Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| from docx import Document | |
| import docx2txt | |
| import io | |
| from typing import Union | |
class DocumentProcessor:
    """Process different document formats (PDF, DOCX, DOC) and extract text.

    All methods are static: they can be called on the class
    (``DocumentProcessor.extract_text(f)``) or on an instance.
    Extraction failures are reported on stdout and never raised to the caller.
    """

    @staticmethod
    def extract_text_from_pdf(file) -> str:
        """Extract text from a PDF file.

        Args:
            file: Anything ``PyPDF2.PdfReader`` accepts. If the object has a
                ``seek`` method it is rewound to the start first.

        Returns:
            The text of all pages, each followed by a newline, with leading
            and trailing whitespace stripped. Returns ``""`` if extraction
            fails for any reason.
        """
        try:
            # Ensure we're at the beginning of the file
            if hasattr(file, 'seek'):
                file.seek(0)
            reader = PyPDF2.PdfReader(file)
            page_texts = []
            for page in reader.pages:
                page_text = page.extract_text()
                # Skip pages for which no text was returned.
                if page_text:
                    page_texts.append(page_text + "\n")
            result = "".join(page_texts).strip()
            print(f"[DEBUG] Extracted {len(result)} characters from PDF")
            return result
        except Exception as e:
            print(f"Error extracting text from PDF: {str(e)}")
            import traceback
            traceback.print_exc()
            return ""

    @staticmethod
    def extract_text_from_docx(file) -> str:
        """Extract text from a DOCX file.

        python-docx is tried first; if it fails or yields only whitespace,
        the stream is rewound (when seekable) and docx2txt is used instead.

        Args:
            file: Path or file-like object accepted by ``docx.Document`` and
                ``docx2txt.process``.

        Returns:
            The stripped text, or ``""`` if both extractors fail.
        """
        try:
            # Try using python-docx first
            try:
                doc = Document(file)
                text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
                if text.strip():
                    return text.strip()
            except Exception as e:
                # Best effort: report why and fall through to docx2txt.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                print(f"python-docx could not read DOCX, falling back to docx2txt: {str(e)}")
            # Fallback to docx2txt; rewind because the first attempt may
            # have consumed the stream. Paths have no seek(), so guard it.
            if hasattr(file, 'seek'):
                file.seek(0)
            text = docx2txt.process(file)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from DOCX: {str(e)}")
            return ""

    @staticmethod
    def extract_text_from_doc(file) -> str:
        """Extract text from a DOC file (legacy Word format).

        Args:
            file: Path or file-like object passed to ``docx2txt.process``.

        Returns:
            The stripped text on success; otherwise a fixed note suggesting
            conversion to .docx.
        """
        try:
            # Rewind so a partially-read stream is parsed from the start.
            if hasattr(file, 'seek'):
                file.seek(0)
            # For .doc files, we'll try docx2txt which has some support
            # NOTE(review): confirm docx2txt can actually read binary .doc
            # files; on failure the note below is returned instead.
            text = docx2txt.process(file)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from DOC: {str(e)}")
            # If docx2txt fails, return a message
            return "Note: Legacy .doc format may require conversion to .docx for better text extraction."

    @staticmethod
    def extract_text(file, file_type: Union[str, None] = None) -> str:
        """Extract text from any supported document format.

        Args:
            file: File object or file-like object.
            file_type: File extension (e.g., '.pdf', '.docx', '.doc').
                A leading dot, surrounding whitespace and letter case are
                ignored. When omitted, the type is taken from the extension
                of ``file.name``, else from the MIME type in ``file.type``,
                else it defaults to PDF.

        Returns:
            Extracted text as string, or ``"Unsupported file type: <type>"``
            when the type is not pdf, docx or doc.
        """
        # Determine file type if not provided
        if file_type is None:
            name = getattr(file, 'name', None)
            if isinstance(name, str):
                file_type = name.split('.')[-1]
            elif hasattr(file, 'type'):
                # ``name`` can be missing or non-string (e.g. an integer
                # file descriptor); use the MIME type in that case.
                type_map = {
                    'application/pdf': 'pdf',
                    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
                    'application/msword': 'doc',
                }
                file_type = type_map.get(file.type, 'pdf')
            else:
                file_type = 'pdf'  # Default to PDF

        # Normalize: drop whitespace and leading dot, compare case-insensitively
        file_type = file_type.strip().lstrip('.').lower()

        # Reset file pointer to beginning
        if hasattr(file, 'seek'):
            file.seek(0)

        # Extract text based on file type
        if file_type == 'pdf':
            return DocumentProcessor.extract_text_from_pdf(file)
        elif file_type == 'docx':
            return DocumentProcessor.extract_text_from_docx(file)
        elif file_type == 'doc':
            return DocumentProcessor.extract_text_from_doc(file)
        else:
            return f"Unsupported file type: {file_type}"