Spaces:
Sleeping
Sleeping
| """ | |
| Enhanced Document Processor - Ensure complete document generation with all sections | |
| Fixes the issue where only Title, TOC, Executive Summary, and References were appearing | |
| """ | |
| import logging | |
| from typing import Dict, List, Optional, Tuple, Any | |
| from datetime import datetime | |
| logger = logging.getLogger(__name__) | |
class EnhancedDocumentProcessor:
    """
    Process and assemble complete documents ensuring all sections are included.

    Validates that generated documents contain all expected sections before output.
    Documents are represented as ordered ``dict`` objects mapping a section name to
    its text; insertion order is the order in which sections are meant to appear.
    """

    # Placeholder markers, stored lower-case because they are matched against
    # lower-cased text in ``_contains_placeholder``.
    _PLACEHOLDER_PATTERNS = (
        "[content here]",
        "[placeholder]",
        "todo:",
        "[insert",
        "[remove",
        "placeholder",
        "blah blah",
        "lorem ipsum",
    )

    # Preferred order of body sections (matched case-insensitively). Sections
    # not listed here are appended afterwards in their input order.
    _BODY_SECTION_ORDER = (
        "Introduction",
        "Objective",
        "Literature Review",
        "Background",
        "Methodology",
        "Methods",
        "Procedure",
        "Body",
        "Results",
        "Findings",
        "Discussion",
        "Analysis",
        "Recommendations",
        "Implications",
        "Conclusion",
        "Conclusions",
    )

    # Keys produced by the assembler itself; caller sections must never clobber them.
    _RESERVED_KEYS = ("Document Title", "Document Metadata", "Table of Contents")

    def __init__(self):
        """Initialize enhanced document processor."""
        # Required section titles per document type. Unknown types fall back
        # to the "research" list (see ``_ensure_all_sections_present``).
        self.required_sections_by_type = {
            "research": [
                "Introduction",
                "Literature Review",
                "Methodology",
                "Results",
                "Discussion",
                "Conclusion",
            ],
            "essay": [
                "Introduction",
                "Body",
                "Analysis",
                "Conclusion",
            ],
            "report": [
                "Executive Summary",
                "Introduction",
                "Findings",
                "Analysis",
                "Recommendations",
                "Conclusion",
            ],
            "lab": [
                "Objective",
                "Procedure",
                "Results",
                "Analysis",
                "Conclusion",
            ],
            "thesis": [
                "Introduction",
                "Literature Review",
                "Methodology",
                "Results",
                "Discussion",
                "Implications",
                "Conclusion",
            ],
        }

    def assemble_complete_document(
        self,
        title: str,
        content_sections: Dict[str, str],
        author: str = "AI Academic Suite",
        document_type: str = "research",
        include_toc: bool = True,
        include_citations: bool = False,
        citations: Optional[List[str]] = None,
    ) -> Tuple[Dict[str, str], List[str]]:
        """
        Assemble complete document with all sections and metadata.

        Args:
            title: Document title
            content_sections: Dictionary of section_name -> content
            author: Document author
            document_type: Type of document
            include_toc: Include table of contents
            include_citations: Include bibliography
            citations: List of citations

        Returns:
            Tuple of (complete_document_dict, validation_messages)
        """
        content_sections = content_sections or {}
        validation_messages = []

        # Validate the sections exactly as the caller supplied them.
        validation_messages.extend(
            self._validate_sections(content_sections, document_type)
        )

        # Fill in any required sections that are absent.
        complete_sections = self._ensure_all_sections_present(
            content_sections, document_type, title
        )

        # Build complete document with proper ordering.
        ordered_document = self._create_ordered_document(
            title=title,
            sections=complete_sections,
            author=author,
            include_toc=include_toc,
            include_citations=include_citations,
            citations=citations,
            document_type=document_type,
        )

        # Final validation of the assembled result.
        validation_messages.extend(self._validate_final_document(ordered_document))
        return ordered_document, validation_messages

    def _validate_sections(
        self, sections: Dict[str, str], document_type: str
    ) -> List[str]:
        """
        Validate that sections have appropriate content.

        Emits a warning for every section whose stripped text is shorter than
        50 characters and for every section containing placeholder text.
        ``document_type`` is accepted for interface symmetry but is not used.
        """
        messages = []
        for section_name, content in sections.items():
            if not content or len(content.strip()) < 50:
                messages.append(
                    f"⚠️ WARNING: Section '{section_name}' appears empty or too short"
                )
            # Check for placeholder content (safe for None/empty content).
            if self._contains_placeholder(content):
                messages.append(
                    f"⚠️ WARNING: Section '{section_name}' contains placeholder text"
                )
        return messages

    def _contains_placeholder(self, text: str) -> bool:
        """
        Check if text contains placeholder patterns.

        Matching is case-insensitive; ``None`` or empty text never matches.
        """
        if not text:
            return False
        text_lower = text.lower()
        return any(pattern in text_lower for pattern in self._PLACEHOLDER_PATTERNS)

    def _ensure_all_sections_present(
        self, content_sections: Dict[str, str], document_type: str, title: str
    ) -> Dict[str, str]:
        """
        Ensure all required sections are present in the document.

        Section names are compared case-insensitively, so a caller-supplied
        "introduction" satisfies the required "Introduction". Missing sections
        are generated on a best-effort basis: any failure (including the
        generator module being unavailable) is logged and the sections
        gathered so far are returned.

        Returns:
            A new dict; ``content_sections`` itself is never mutated.
        """
        required = self.required_sections_by_type.get(
            (document_type or "").lower(),
            self.required_sections_by_type["research"],
        )
        complete_sections = dict(content_sections)

        present = {name.lower() for name in content_sections}
        missing_sections = [s for s in required if s.lower() not in present]
        if not missing_sections:
            return complete_sections

        logger.warning("Missing sections detected: %s", missing_sections)
        try:
            # Imported lazily so the generator dependency is only needed when
            # something actually has to be generated.
            from src.ai_engine import AdvancedContentGenerator

            generator = AdvancedContentGenerator()
            for missing_section in missing_sections:
                logger.info("Generating missing section: %s", missing_section)
                # NOTE(review): assumed to return a mapping of
                # section name -> generated text; confirm against src.ai_engine.
                generated_content = generator.generate_complete_document_sections(
                    sections=[missing_section],
                    title=title,
                    context="",
                    topics=[title],
                    style="academic",
                    total_words=500,
                )
                if generated_content:
                    complete_sections[missing_section] = generated_content.get(
                        missing_section, ""
                    )
        except Exception as e:  # best-effort boundary: never fail assembly here
            logger.error("Error generating missing sections: %s", e)
        return complete_sections

    def _create_ordered_document(
        self,
        title: str,
        sections: Dict[str, str],
        author: str,
        include_toc: bool,
        include_citations: bool,
        citations: Optional[List[str]],
        document_type: str = "research",
    ) -> Dict[str, str]:
        """
        Create properly ordered document with all components.

        Resulting order: title, metadata, abstract/executive summary (if
        present), table of contents (optional), body sections in the preferred
        order, any other sections in input order, then references (optional).

        Args:
            title: Document title
            sections: Dictionary of section_name -> content
            author: Document author
            include_toc: Include table of contents
            include_citations: Include bibliography
            citations: List of citations
            document_type: Type label shown in the metadata block
        """
        ordered = {}

        # 1. Title
        ordered["Document Title"] = title

        # 2. Metadata
        type_label = (document_type or "research").strip().title() or "Research"
        ordered["Document Metadata"] = (
            f"Author: {author}\n"
            f"Date: {datetime.now().strftime('%B %d, %Y')}\n"
            f"Document Type: {type_label} Document\n"
            f"Status: Complete"
        )

        # 3. Abstract/Executive Summary (first match wins, case-insensitive)
        summary_key = None
        for wanted in ("abstract", "executive summary"):
            summary_key = next((k for k in sections if k.lower() == wanted), None)
            if summary_key is not None:
                break
        if summary_key is not None:
            ordered[summary_key] = sections[summary_key]

        # Work out the body order up front so the table of contents can list
        # sections in the order they actually appear. The sort is stable, so
        # unknown sections keep their input order after the known ones.
        rank = {name.lower(): i for i, name in enumerate(self._BODY_SECTION_ORDER)}
        body_keys = [
            k for k in sections if k != summary_key and k not in self._RESERVED_KEYS
        ]
        body_keys.sort(key=lambda k: rank.get(k.lower(), len(rank)))

        # 4. Table of Contents
        if include_toc:
            toc_keys = ([summary_key] if summary_key is not None else []) + body_keys
            ordered["Table of Contents"] = self._generate_table_of_contents(
                {k: sections[k] for k in toc_keys}
            )

        # 5. Body sections (preferred order first, then the rest)
        for key in body_keys:
            ordered[key] = sections[key]

        # 6. References/Bibliography
        if include_citations and citations:
            ordered["References"] = "\n\n".join(
                f"{i}. {citation}" for i, citation in enumerate(citations, 1)
            )
        return ordered

    def _generate_table_of_contents(self, sections: Dict[str, str]) -> str:
        """
        Generate table of contents from sections.

        Entries follow the iteration order of ``sections`` and are numbered
        consecutively from 1; title and metadata entries are skipped.
        """
        entries = [
            name
            for name in sections
            if name not in ("Document Title", "Document Metadata")
        ]
        toc_lines = ["# Table of Contents\n"]
        toc_lines.extend(f"{i}. {name}" for i, name in enumerate(entries, 1))
        return "\n".join(toc_lines)

    def _validate_final_document(self, document: Dict[str, str]) -> List[str]:
        """
        Validate final assembled document.

        Returns warnings for a very short document or one without any core
        content section, followed by a summary line. An empty document yields
        a single error message.
        """
        messages = []
        if not document:
            messages.append("❌ ERROR: Document is empty")
            return messages

        # Check for minimum content (None values count as empty).
        total_chars = sum(len(v or "") for v in document.values())
        if total_chars < 1000:
            messages.append(
                f"⚠️ WARNING: Document is very short ({total_chars} characters)"
            )

        # Verify that at least one core section exists.
        section_names = {k.lower() for k in document}
        core_sections = ("introduction", "conclusion", "results", "findings")
        if not any(name in section_names for name in core_sections):
            messages.append("⚠️ WARNING: Missing core content sections")

        # Summary message
        messages.append(
            f"✅ Document assembled successfully with {len(document)} sections "
            f"({total_chars} total characters)"
        )
        return messages

    def validate_document_completeness(
        self, document: Dict[str, str]
    ) -> Tuple[bool, List[str]]:
        """
        Validate that document is complete and ready for output.

        Returns:
            Tuple of (is_complete, validation_messages). ``is_complete`` is
            False only when the document has fewer than 3 sections; empty or
            short sections are reported as messages but do not fail the check.
        """
        messages = []

        # Check each section (None and whitespace-only count as empty).
        for section_name, content in document.items():
            text = content or ""
            if not text.strip():
                messages.append(f"❌ Empty section: {section_name}")
            elif len(text) < 100:
                messages.append(
                    f"⚠️ Very short section: {section_name} ({len(text)} chars)"
                )

        # Check overall completeness
        total_length = sum(len(c or "") for c in document.values())
        section_count = len(document)
        # NOTE(review): the threshold is 3 although the message mentions 4-6;
        # left as-is because the intended minimum is not evident from this file.
        if section_count < 3:
            messages.append(
                f"❌ Too few sections: {section_count} (expected minimum 4-6)"
            )
            return False, messages
        if total_length < 2000:
            messages.append(f"⚠️ Document very short: {total_length} characters")

        # Success message with stats goes first.
        messages.insert(
            0,
            f"✅ Document Complete: {section_count} sections, {total_length} characters",
        )
        return True, messages

    def get_section_statistics(self, document: Dict[str, str]) -> Dict[str, Any]:
        """
        Get statistics about document sections.

        Returns a dict with overall totals (sections, characters, words) and a
        per-section breakdown of characters, whitespace-separated words and
        non-blank paragraphs (split on blank lines). ``None`` content is
        treated as empty text.
        """
        section_details = {}
        for section_name, content in document.items():
            text = content or ""
            section_details[section_name] = {
                "characters": len(text),
                "words": len(text.split()),
                "paragraphs": len([p for p in text.split("\n\n") if p.strip()]),
            }
        return {
            "total_sections": len(document),
            "total_characters": sum(d["characters"] for d in section_details.values()),
            "total_words": sum(d["words"] for d in section_details.values()),
            "section_details": section_details,
        }