Spaces:

Propelis
/

QC_Rules

Sleeping

App Files Files Community

QC_Rules / src /extract_text /ingest.py

Jakecole1

Upload 18 files

863cb78 verified 8 months ago

raw

history blame contribute delete

3.87 kB

	import base64
	from io import BytesIO

	class RequirementsIngest:
	    """Load a requirements document (TXT or PDF) from a file-like object."""

	    # Maximum number of extracted PDF characters kept in 'text_content';
	    # longer extractions are cut here and suffixed with "...".
	    _PREVIEW_LIMIT = 1000

	    def __init__(self):
	        pass

	    def ingest_requirements_document(self, file_obj) -> dict:
	        """
	        Ingest a requirements document from a file-like object.
	        Supports both TXT and PDF files; the kind is chosen from the
	        extension of ``file_obj.name`` (case-insensitive). Anything that
	        is not ``.pdf`` is treated as text.

	        Args:
	            file_obj: Object providing ``seek(0)`` and ``read()``. An optional
	                ``name`` attribute supplies the filename; when it is missing
	                or not a string, ``'unknown'`` is used.

	        Returns:
	            dict: {
	                'type': 'text' or 'pdf',
	                'content': str (for text) or base64 string (for PDF),
	                'filename': str,
	                'text_content': str (extracted text preview for PDFs,
	                                same as content for TXT),
	                'file_size': int (number of bytes read for PDFs, UTF-8
	                             encoded length of the text for TXT)
	            }

	        Raises:
	            ValueError: If the document cannot be read; the original
	                exception is attached as ``__cause__``.
	        """
	        try:
	            name = getattr(file_obj, 'name', 'unknown')
	            # Some file objects expose a non-string name (e.g. an integer
	            # file descriptor or None); fall back instead of crashing.
	            filename = name if isinstance(name, str) else 'unknown'
	            _, dot, suffix = filename.lower().rpartition('.')
	            file_extension = suffix if dot else ''

	            # Always start from the beginning, whatever the caller read before.
	            file_obj.seek(0)

	            if file_extension == 'pdf':
	                pdf_content = file_obj.read()

	                # The PDF itself is the main content, shipped as base64.
	                pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')

	                return {
	                    'type': 'pdf',
	                    'content': pdf_base64,
	                    'filename': filename,
	                    # Text is extracted only for backward compatibility/display.
	                    'text_content': self._extract_pdf_preview(pdf_content, filename),
	                    'file_size': len(pdf_content),
	                }

	            # Handle text file (default behavior)
	            text = file_obj.read()
	            if isinstance(text, bytes):
	                text = text.decode("utf-8", errors="replace")

	            return {
	                'type': 'text',
	                'content': text,
	                'filename': filename,
	                'text_content': text,
	                'file_size': len(text.encode('utf-8')),
	            }

	        except Exception as e:
	            raise ValueError(f"Error reading requirements document: {e}") from e

	    def _extract_pdf_preview(self, pdf_content, filename) -> str:
	        """Return a short text preview of a PDF, or a placeholder description.

	        Extraction is best-effort: a missing PyPDF2 install or any parsing
	        failure yields a descriptive placeholder string rather than an error.
	        """
	        try:
	            from PyPDF2 import PdfReader
	        except ImportError:
	            # PyPDF2 not available, use basic description
	            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"

	        try:
	            reader = PdfReader(BytesIO(pdf_content))
	            # extract_text() may yield None for pages without a text layer;
	            # treat those as empty so one such page does not discard the rest.
	            text_content = "".join(
	                (page.extract_text() or "") + "\n" for page in reader.pages
	            )
	        except Exception as e:
	            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

	        if not text_content.strip():
	            return f"PDF Requirements Document: {filename} (no text content found)"

	        # Limit text content for display
	        if len(text_content) > self._PREVIEW_LIMIT:
	            return text_content[:self._PREVIEW_LIMIT] + "..."
	        return text_content