Spaces:

Propelis
/

QC_Rules

Sleeping

File size: 3,869 Bytes

863cb78

import base64
from io import BytesIO

class RequirementsIngest:
    """Read a requirements document (plain text or PDF) from a file-like object."""

    # Extracted PDF text longer than this is cut and suffixed with "...".
    _PREVIEW_CHAR_LIMIT = 1000

    def __init__(self):
        pass

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.
        Supports both TXT and PDF files; anything whose name does not end in
        ``.pdf`` (case-insensitive) is treated as text.

        Args:
            file_obj: Object exposing ``seek(0)`` and ``read()``. Its optional
                ``name`` attribute is used as the filename and to pick the
                file type; when missing or ``None`` the name is 'unknown'.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (extracted text preview or a placeholder
                    description for PDFs, same as content for TXT),
                'file_size': int (number of bytes read; for str input, the
                    length of its UTF-8 encoding)
            }

        Raises:
            ValueError: If the document cannot be read. The underlying
                exception is attached as ``__cause__``.
        """
        try:
            filename = self._resolve_filename(file_obj)
            # Text after the last dot, lower-cased; '' when there is no dot.
            _, dot, suffix = filename.rpartition('.')
            file_extension = suffix.lower() if dot else ''

            file_obj.seek(0)
            raw = file_obj.read()

            if file_extension == 'pdf':
                return {
                    'type': 'pdf',
                    # The PDF itself is the main content, base64-encoded.
                    'content': base64.b64encode(raw).decode('utf-8'),
                    'filename': filename,
                    # Text is extracted only for backward compatibility.
                    'text_content': self._extract_pdf_preview(raw, filename),
                    'file_size': len(raw)
                }

            # Handle text file (default behavior)
            if isinstance(raw, bytes):
                text = raw.decode("utf-8", errors="replace")
                # Report the real byte count, not the size of the decoded
                # text (replacement characters would inflate it).
                file_size = len(raw)
            else:
                text = raw
                file_size = len(text.encode('utf-8'))

            return {
                'type': 'text',
                'content': text,
                'filename': filename,
                'text_content': text,
                'file_size': file_size
            }

        except Exception as e:
            raise ValueError(f"Error reading requirements document: {e}") from e

    @staticmethod
    def _resolve_filename(file_obj) -> str:
        """Return the object's ``name`` as a str, or 'unknown' if absent/None."""
        name = getattr(file_obj, 'name', None)
        if name is None:
            return 'unknown'
        if isinstance(name, bytes):
            return name.decode('utf-8', errors='replace')
        # e.g. files opened from a descriptor expose an int name.
        return name if isinstance(name, str) else str(name)

    def _extract_pdf_preview(self, pdf_bytes, filename) -> str:
        """Return a truncated text preview of the PDF, or a placeholder
        description when extraction is unavailable, fails, or finds no text."""
        try:
            # Optional dependency: imported lazily so text files and
            # environments without PyPDF2 keep working.
            from PyPDF2 import PdfReader

            reader = PdfReader(BytesIO(pdf_bytes))
            # extract_text() may yield None for pages without a text layer;
            # treat that as an empty page instead of failing the whole file.
            text_content = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except ImportError:
            # PyPDF2 not available, use basic description
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"
        except Exception as e:
            # Best effort: a broken PDF must not prevent returning the bytes.
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

        if not text_content.strip():
            return f"PDF Requirements Document: {filename} (no text content found)"

        # Limit text content for display
        limit = self._PREVIEW_CHAR_LIMIT
        if len(text_content) > limit:
            return text_content[:limit] + "..."
        return text_content