| | import base64 |
| | from io import BytesIO |
| |
|
class RequirementsIngest:
    """Read a requirements document (plain text or PDF) into a uniform dict."""

    # Upper bound on the number of extracted PDF characters kept in
    # 'text_content'; longer extractions are cut here and suffixed with "...".
    PDF_TEXT_PREVIEW_CHARS = 1000

    def __init__(self):
        pass

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.

        The object is rewound with ``seek(0)`` and read in full. A name
        ending in ``.pdf`` (case-insensitive) selects PDF handling; anything
        else, including objects without a ``name``, is treated as text.

        Args:
            file_obj: Object providing ``seek`` and ``read``. An optional
                ``name`` attribute supplies the filename.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str ('unknown' when the object has no name),
                'text_content': for text, same as 'content'; for PDF, the
                    extracted text truncated to PDF_TEXT_PREVIEW_CHARS
                    characters plus "...", or a placeholder message when
                    extraction is unavailable, fails, or finds no text,
                'file_size': int, size in bytes of the data that was read
            }

        Raises:
            ValueError: If the document cannot be read; the underlying
                exception is attached as ``__cause__``.
        """
        try:
            filename = self._resolve_filename(file_obj)
            if filename.lower().endswith('.pdf'):
                return self._ingest_pdf(file_obj, filename)
            return self._ingest_text(file_obj, filename)
        except Exception as exc:
            raise ValueError(f"Error reading requirements document: {exc}") from exc

    @staticmethod
    def _resolve_filename(file_obj) -> str:
        """Return the object's name as a str, or 'unknown' when it has none."""
        name = getattr(file_obj, 'name', None)
        if name is None:
            return 'unknown'
        if isinstance(name, bytes):
            return name.decode('utf-8', errors='replace')
        # Some file objects expose a non-string name (e.g. an integer file
        # descriptor); normalise so the extension check cannot fail.
        return name if isinstance(name, str) else str(name)

    def _ingest_pdf(self, file_obj, filename) -> dict:
        """Read the PDF once and build the result dict for it."""
        file_obj.seek(0)
        pdf_bytes = file_obj.read()
        return {
            'type': 'pdf',
            'content': base64.b64encode(pdf_bytes).decode('utf-8'),
            'filename': filename,
            'text_content': self._extract_pdf_text(pdf_bytes, filename),
            'file_size': len(pdf_bytes),
        }

    def _extract_pdf_text(self, pdf_bytes, filename) -> str:
        """Return a text preview of the PDF, or a placeholder message."""
        label = f"PDF Requirements Document: {filename}"

        # PyPDF2 is optional: only the import itself is guarded by ImportError
        # so that failures during parsing are reported as extraction failures.
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            return f"{label} (PyPDF2 not available for text extraction)"

        # Extraction is best-effort; any parser error becomes a placeholder
        # message rather than failing the whole ingest.
        try:
            reader = PdfReader(BytesIO(pdf_bytes))
            # A page may yield no text (None); treat that as empty instead of
            # letting one such page discard the text of every other page.
            extracted = "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
        except Exception as exc:
            return f"{label} (text extraction failed: {exc})"

        if not extracted.strip():
            return f"{label} (no text content found)"

        limit = self.PDF_TEXT_PREVIEW_CHARS
        if len(extracted) > limit:
            return extracted[:limit] + "..."
        return extracted

    @staticmethod
    def _ingest_text(file_obj, filename) -> dict:
        """Read a text document (str or UTF-8 bytes) and build its result dict."""
        file_obj.seek(0)
        raw = file_obj.read()
        if isinstance(raw, bytes):
            text = raw.decode("utf-8", errors="replace")
            # Report the size of the bytes actually read; re-encoding the
            # decoded text would inflate it when invalid bytes were replaced.
            size = len(raw)
        else:
            text = raw
            size = len(text.encode('utf-8'))

        return {
            'type': 'text',
            'content': text,
            'filename': filename,
            'text_content': text,
            'file_size': size,
        }
| |
|
| |
|
| |
|