# Jakecole1's picture
# Upload 18 files
# 863cb78 verified
import base64
from io import BytesIO
class RequirementsIngest:
    """Load a requirements document (TXT or PDF) from a file-like object."""

    # Maximum number of extracted characters kept as the PDF text preview.
    _PREVIEW_LIMIT = 1000

    def __init__(self):
        pass

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.

        The file type is decided from the extension of ``file_obj.name``
        (case-insensitive). Anything that is not ``.pdf`` is treated as text.

        Args:
            file_obj: Object exposing ``seek(0)`` and ``read()``. An optional
                ``name`` attribute supplies the filename; when it is missing
                or ``None`` the filename is reported as ``'unknown'``.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (preview text for PDFs, same as
                    content for TXT),
                'file_size': int (size in bytes; for text, the size of the
                    UTF-8 encoding of the decoded content)
            }

        Raises:
            ValueError: If the document cannot be read. The original
                exception is attached as ``__cause__``.
        """
        try:
            filename = self._resolve_filename(file_obj)
            file_extension = filename.lower().split('.')[-1] if '.' in filename else ''

            # Read the stream exactly once; both branches work from `raw`.
            file_obj.seek(0)
            raw = file_obj.read()

            if file_extension == 'pdf':
                return {
                    'type': 'pdf',
                    # The PDF itself is the main content, base64-encoded.
                    'content': base64.b64encode(raw).decode('utf-8'),
                    'filename': filename,
                    # Best-effort text preview; never raises.
                    'text_content': self._extract_pdf_preview(raw, filename),
                    'file_size': len(raw),
                }

            # Text file (default behavior). Undecodable bytes are replaced
            # rather than rejected.
            if isinstance(raw, (bytes, bytearray)):
                text = raw.decode("utf-8", errors="replace")
            else:
                text = raw
            return {
                'type': 'text',
                'content': text,
                'filename': filename,
                'text_content': text,
                'file_size': len(text.encode('utf-8')),
            }
        except Exception as e:
            # Chain the cause so callers and logs can see the real failure.
            raise ValueError(f"Error reading requirements document: {e}") from e

    @staticmethod
    def _resolve_filename(file_obj) -> str:
        """Return ``file_obj.name`` as a string, or ``'unknown'`` if absent/None."""
        name = getattr(file_obj, 'name', None)
        if name is None:
            return 'unknown'
        # Some file objects expose a non-string name (e.g. an integer file
        # descriptor); coerce so the extension check cannot crash.
        return name if isinstance(name, str) else str(name)

    def _extract_pdf_preview(self, pdf_bytes, filename: str) -> str:
        """Return a short text preview of a PDF, or a descriptive placeholder.

        PyPDF2 is an optional dependency: if it is missing, or extraction
        fails for any reason, a placeholder string naming the file and the
        reason is returned instead of raising.
        """
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"

        try:
            reader = PdfReader(BytesIO(pdf_bytes))
            # `or ''` guards against a page yielding no text object, so one
            # empty page cannot discard the text of the whole document.
            text_content = "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
        except Exception as e:
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

        if not text_content.strip():
            return f"PDF Requirements Document: {filename} (no text content found)"

        # Limit text content for display.
        limit = self._PREVIEW_LIMIT
        if len(text_content) > limit:
            return text_content[:limit] + "..."
        return text_content