Spaces:
Runtime error
Runtime error
| # # utils/file_processor.py | |
| # import os | |
| # import json | |
| # import csv | |
| # import docx # From python-docx | |
| # import PyPDF2 | |
| # class FileProcessor: | |
| # """ | |
| # A utility class to process various file types and extract their text content. | |
| # Supports .txt, .pdf, .docx, .json, and .csv files. | |
| # """ | |
| # def __init__(self): | |
| # """Initializes the FileProcessor.""" | |
| # pass | |
| # def extract_text(self, file_path: str) -> str: | |
| # """ | |
| # Extracts text content from a given file based on its extension. | |
| # Args: | |
| # file_path (str): The full path to the file. | |
| # Returns: | |
| # str: The extracted text content, or an empty string if extraction fails. | |
| # """ | |
| # if not os.path.exists(file_path): | |
| # print(f"Warning: File not found at {file_path}") | |
| # return "" | |
| # # Get the file extension and normalize it | |
| # _, extension = os.path.splitext(file_path) | |
| # extension = extension.lower() | |
| # try: | |
| # if extension == '.txt': | |
| # return self._read_txt(file_path) | |
| # elif extension == '.pdf': | |
| # return self._read_pdf(file_path) | |
| # elif extension == '.docx': | |
| # return self._read_docx(file_path) | |
| # elif extension == '.json': | |
| # return self._read_json(file_path) | |
| # elif extension == '.csv': | |
| # return self._read_csv(file_path) | |
| # elif extension == '.doc': | |
| # return "Legacy .doc files are not supported. Please convert to .docx." | |
| # else: | |
| # print(f"Warning: Unsupported file type: {extension}") | |
| # return "" | |
| # except Exception as e: | |
| # print(f"Error processing file {file_path}: {e}") | |
| # return f"Error extracting content from file. It may be corrupted or protected." | |
| # def _read_txt(self, file_path: str) -> str: | |
| # """Reads content from a .txt file.""" | |
| # with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: | |
| # return f.read() | |
| # def _read_pdf(self, file_path: str) -> str: | |
| # """Reads content from a .pdf file using PyPDF2.""" | |
| # text = [] | |
| # with open(file_path, 'rb') as f: | |
| # reader = PyPDF2.PdfReader(f) | |
| # for page in reader.pages: | |
| # page_text = page.extract_text() | |
| # if page_text: | |
| # text.append(page_text) | |
| # return "\n".join(text) | |
| # def _read_docx(self, file_path: str) -> str: | |
| # """Reads content from a .docx file using python-docx.""" | |
| # doc = docx.Document(file_path) | |
| # text = [p.text for p in doc.paragraphs] | |
| # return "\n".join(text) | |
| # def _read_json(self, file_path: str) -> str: | |
| # """Reads and pretty-prints content from a .json file.""" | |
| # with open(file_path, 'r', encoding='utf-8') as f: | |
| # data = json.load(f) | |
| # # Convert JSON object to a nicely formatted string | |
| # return json.dumps(data, indent=2) | |
| # def _read_csv(self, file_path: str) -> str: | |
| # """Reads content from a .csv file and formats it as a string.""" | |
| # text = [] | |
| # with open(file_path, 'r', encoding='utf-8', newline='') as f: | |
| # reader = csv.reader(f) | |
| # for row in reader: | |
| # text.append(", ".join(row)) | |
| # return "\n".join(text) | |
| # utils/file_processor.py | |
| import os | |
| import json | |
| import csv | |
| import docx | |
| import fitz # PyMuPDF library | |
class FileProcessor:
    """Extract plain text from files, choosing a reader by file extension.

    Supported extensions are ``.txt``, ``.pdf`` (via PyMuPDF), ``.docx``
    (via python-docx), ``.json`` and ``.csv``. Failures never propagate out of
    :meth:`extract_text`; they are reported with ``print`` and signalled
    through the returned string.
    """

    # Returned *as content* for legacy Word documents, which no reader handles.
    _LEGACY_DOC_MESSAGE = (
        "Legacy .doc files are not supported. Please convert to .docx."
    )
    # Returned *as content* when a reader raises for any reason.
    _EXTRACTION_ERROR_MESSAGE = (
        "Error extracting content from file. It may be corrupted or protected."
    )

    def extract_text(self, file_path: str) -> str:
        """Extract the text content of ``file_path`` based on its extension.

        Args:
            file_path: Path to the file to read.

        Returns:
            The extracted text. An empty string is returned when the path does
            not exist or the extension is not recognised. A fixed explanatory
            message is returned for ``.doc`` files and when the selected
            reader raises an exception.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return ""

        # Normalise the extension so that ".PDF" and ".pdf" are treated alike.
        extension = os.path.splitext(file_path)[1].lower()

        if extension == ".doc":
            return self._LEGACY_DOC_MESSAGE

        # Bound methods are looked up on ``self`` so subclass overrides apply.
        readers = {
            ".txt": self._read_txt,
            ".pdf": self._read_pdf_with_pymupdf,
            ".docx": self._read_docx,
            ".json": self._read_json,
            ".csv": self._read_csv,
        }
        reader = readers.get(extension)
        if reader is None:
            print(f"Warning: Unsupported file type: {extension}")
            return ""

        try:
            return reader(file_path)
        except Exception as e:
            # Deliberate best-effort boundary: any reader failure (bad
            # encoding, malformed document, missing permission, ...) is
            # reported and converted into a message instead of crashing.
            print(f"Error processing file {file_path}: {e}")
            return self._EXTRACTION_ERROR_MESSAGE

    def _read_txt(self, file_path: str) -> str:
        """Return the content of a ``.txt`` file.

        The file is decoded as UTF-8; undecodable bytes are dropped and a
        leading byte-order mark, if present, is stripped ("utf-8-sig").
        """
        with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read()

    def _read_pdf_with_pymupdf(self, file_path: str) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Uses the PyMuPDF (``fitz``) library.
        """
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)

    def _read_docx(self, file_path: str) -> str:
        """Return the text of ``doc.paragraphs`` of a ``.docx`` file.

        Paragraph texts are joined by newlines. Uses python-docx.
        """
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    def _read_json(self, file_path: str) -> str:
        """Return the content of a ``.json`` file re-serialised with indent 2.

        The file is decoded with "utf-8-sig" so a leading byte-order mark does
        not make parsing fail, and ``ensure_ascii=False`` keeps non-ASCII
        characters readable instead of turning them into ``\\uXXXX`` escapes.
        """
        with open(file_path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _read_csv(self, file_path: str) -> str:
        """Return the rows of a ``.csv`` file as text.

        Cells of a row are joined with ", " and rows with newlines. The file
        is decoded with "utf-8-sig" so a byte-order mark does not end up in
        the first cell.
        """
        # newline="" lets the csv module handle embedded line breaks itself.
        with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
            return "\n".join(", ".join(row) for row in csv.reader(f))