| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import List |
|
|
| import pandas as pd |
|
|
| """ |
| Why dataclass decoration? |
| - Cleaner syntax for simple data containers. |
| - Automatic generation of __init__, __repr__, and other methods. |
| - Ideal for the Document class, which is just a structured way to hold data. |
| """ |
|
|
@dataclass
class Document:
    """
    One loaded document together with its source metadata.

    Fields:
        source: relative file path, kept so answers can be attributed
            to the file they came from.
        text: plain text extracted from the file.
        file_type: extension of the original file.
        character_count: size of the text in characters; handy for
            debugging and monitoring.
    """

    source: str
    text: str
    file_type: str
    character_count: int
|
|
|
|
def read_text_file(path: Path) -> str:
    """
    Read a plain text file and return its contents as a string.

    The file is decoded as UTF-8. A leading byte-order mark, if present,
    is dropped ("utf-8-sig") so it does not end up as an invisible
    character at the start of the extracted text. Files without a BOM
    decode exactly as with plain "utf-8".

    errors='ignore' prevents a full crash if the file contains
    bytes that are not valid UTF-8; such bytes are silently skipped.

    Args:
        path: location of the text file.

    Returns:
        The decoded file contents.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")
|
|
|
|
def read_csv_file(path: Path) -> str:
    """
    Read a CSV file and convert each row into one line of readable text.

    Why convert CSV to text?
    RAG retrieval works on text chunks. A row must become text before
    it can be embedded and retrieved.

    Every cell is read as a string and default NA detection is turned
    off, so the generated text shows the values exactly as they appear
    in the file: "2.50" stays "2.50" (not 2.5), integers are not turned
    into floats, and an empty cell stays empty instead of becoming "nan".

    Args:
        path: location of the CSV file; the first row is the header.

    Returns:
        One line per data row, formatted as
        "Row N: col_a: value | col_b: value", joined with newlines.
        Rows are numbered from 1. A file with a header but no data
        rows yields an empty string.
    """
    df = pd.read_csv(path, dtype=str, keep_default_na=False)
    column_names = [str(name) for name in df.columns]

    lines = []
    # itertuples yields plain tuples of cell values; enumerate supplies the
    # 1-based row number without relying on the DataFrame index.
    for row_number, values in enumerate(
        df.itertuples(index=False, name=None), start=1
    ):
        row_text = " | ".join(
            f"{column_name}: {value}"
            for column_name, value in zip(column_names, values)
        )
        lines.append(f"Row {row_number}: {row_text}")

    return "\n".join(lines)
|
|
|
|
def read_pdf_file(path: Path) -> str:
    """
    Pull the text out of a PDF, one labelled section per page.

    Each page contributes a "--- Page N ---" header (pages are numbered
    from 1) followed by whatever text could be extracted; a page with no
    extractable text contributes an empty body.

    Important limitation:
    pypdf works for text-based PDFs.
    It may not work for scanned image PDFs.

    Production options for scanned PDFs:
    - Tesseract OCR
    - AWS Textract
    - Azure Document Intelligence
    - Google Document AI

    Raises:
        ImportError: if pypdf is not installed.
    """
    # Imported lazily so pypdf is only required when a PDF is actually read.
    try:
        from pypdf import PdfReader
    except ImportError as error:
        raise ImportError("pypdf is not installed. Run: pip install pypdf") from error

    pdf = PdfReader(str(path))

    # extract_text() may give back nothing for a page; fall back to "".
    numbered_texts = [
        (number, pdf_page.extract_text() or "")
        for number, pdf_page in enumerate(pdf.pages, start=1)
    ]

    return "\n".join(
        f"\n--- Page {page_number} ---\n{page_text}"
        for page_number, page_text in numbered_texts
    )
|
|
|
|
def load_single_document(path: Path, project_root: Path) -> Document:
    """
    Load one supported document and return a Document object.

    This function keeps file-type-specific logic in one place: the file
    extension (case-insensitive) selects the reader, and the extracted
    text is stripped of leading/trailing whitespace before it is stored.

    Args:
        path: file to load (.txt, .md, .csv or .pdf).
        project_root: directory that the stored source path is made
            relative to.

    Returns:
        A Document holding the stripped text, its length, the lower-cased
        extension and the source path. The source is relative to
        project_root when the file lives under it; otherwise the path is
        stored as given, so a readable file is not rejected only because
        it sits outside the project root.

    Raises:
        ValueError: if the file extension is not supported.
    """
    extension = path.suffix.lower()

    if extension in {".txt", ".md"}:
        text = read_text_file(path)
    elif extension == ".csv":
        text = read_csv_file(path)
    elif extension == ".pdf":
        text = read_pdf_file(path)
    else:
        raise ValueError(f"Unsupported file type: {extension}")

    text = text.strip()

    # relative_to raises ValueError when path is not located under
    # project_root; keep the document and fall back to the path as given.
    try:
        source = str(path.relative_to(project_root))
    except ValueError:
        source = str(path)

    return Document(
        source=source,
        text=text,
        file_type=extension,
        character_count=len(text),
    )
|
|
|
|
def load_documents(folder: Path, project_root: Path) -> List[Document]:
    """
    Walk a folder recursively and load every supported document in it.

    Files are visited in sorted path order. Anything that is not a
    regular file, or whose extension (case-insensitive) is not one of
    .txt, .md, .csv, .pdf, is skipped. A file that fails to load is
    reported on stdout and skipped, so one bad file does not abort the
    whole run. Documents whose text is empty are left out.

    Returns:
        List[Document]

    AI ENGINEER PRODUCTION TIP:
    Always keep source metadata. Without source metadata, your app
    cannot explain where an answer came from.
    """
    allowed_suffixes = {".txt", ".md", ".csv", ".pdf"}

    candidates = (
        entry
        for entry in sorted(folder.rglob("*"))
        if entry.is_file() and entry.suffix.lower() in allowed_suffixes
    )

    loaded = []
    for candidate in candidates:
        # Best-effort boundary: report the failure and move on to the next file.
        try:
            doc = load_single_document(path=candidate, project_root=project_root)
            if doc.text:
                loaded.append(doc)
        except Exception as error:
            print(f"Could not load file: {candidate}")
            print(f"Reason: {error}")

    return loaded