import base64
from io import BytesIO


class RequirementsIngest:
    """Read requirements documents (plain text or PDF) into a uniform dict."""

    # Maximum number of extracted PDF characters kept in ``text_content``.
    _PREVIEW_LIMIT = 1000

    def __init__(self):
        pass

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.
        Supports both TXT and PDF files; anything whose name does not end in
        ``.pdf`` (case-insensitive) is treated as text.

        Args:
            file_obj: Object exposing ``seek(0)`` and ``read()``. Its optional
                ``name`` attribute supplies the filename; when ``name`` is
                missing or is neither ``str`` nor ``bytes`` (e.g. an integer
                file descriptor), ``'unknown'`` is used.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (same as content for TXT; for PDFs a
                    preview of the extracted text limited to 1000 characters
                    plus "...", or a placeholder message when no text could
                    be extracted),
                'file_size': int (size in bytes of the PDF data, or of the
                    UTF-8 encoded text)
            }

        Raises:
            ValueError: If anything goes wrong while reading the document.
                The original exception is chained as ``__cause__``.
        """
        try:
            filename = self._resolve_filename(file_obj)
            # Rewind in case the caller has already consumed the stream.
            file_obj.seek(0)

            if filename.lower().endswith('.pdf'):
                pdf_content = file_obj.read()
                return {
                    'type': 'pdf',
                    # The PDF itself travels as base64; the text preview is
                    # only kept for backward compatibility.
                    'content': base64.b64encode(pdf_content).decode('utf-8'),
                    'filename': filename,
                    'text_content': self._pdf_text_preview(pdf_content, filename),
                    'file_size': len(pdf_content),
                }

            # Default behavior: treat the document as text.
            text = file_obj.read()
            if isinstance(text, bytes):
                text = text.decode("utf-8", errors="replace")
            return {
                'type': 'text',
                'content': text,
                'filename': filename,
                'text_content': text,
                'file_size': len(text.encode('utf-8')),
            }
        except Exception as e:
            raise ValueError(f"Error reading requirements document: {e}") from e

    @staticmethod
    def _resolve_filename(file_obj) -> str:
        """Return the object's ``name`` as a string, or ``'unknown'``."""
        raw_name = getattr(file_obj, 'name', 'unknown')
        if isinstance(raw_name, bytes):
            return raw_name.decode("utf-8", errors="replace")
        if isinstance(raw_name, str):
            return raw_name
        # Non-textual names (such as integer descriptors) carry no extension.
        return 'unknown'

    @classmethod
    def _pdf_text_preview(cls, pdf_bytes, filename) -> str:
        """Return a short text preview of the PDF, or a placeholder message."""
        try:
            # Optional dependency: extraction is skipped when it is missing.
            from PyPDF2 import PdfReader
        except ImportError:
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"

        try:
            reader = PdfReader(BytesIO(pdf_bytes))
            # ``or ""`` keeps one page without a text result from discarding
            # the text of every other page.
            extracted = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            # Best effort: a broken PDF must not prevent ingesting the file.
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

        if not extracted.strip():
            return f"PDF Requirements Document: {filename} (no text content found)"
        if len(extracted) > cls._PREVIEW_LIMIT:
            # Limit text content for display.
            return extracted[:cls._PREVIEW_LIMIT] + "..."
        return extracted