"""
Content Generator - Generate academic content using AI models
"""

import re
from typing import Dict, List, Optional, Any
from textwrap import dedent
import logging

logger = logging.getLogger(__name__)


class ContentGenerator:
    """
    Generate academic content using Hugging Face models.

    If the model cannot be loaded (or a generation call fails), a
    deterministic template-based fallback is used instead, so every public
    method still returns text.
    """

    # Upper bound on newly generated tokens per section.
    _MAX_NEW_TOKENS = 512

    def __init__(self, model_name: str = "HuggingFaceH4/zephyr-7b-beta"):
        """
        Initialize content generator.

        Args:
            model_name: Hugging Face model identifier
        """
        self.model_name = model_name
        self.pipeline = None
        self._init_model()

    def _init_model(self):
        """Initialize the language model, leaving ``self.pipeline`` as None on failure."""
        try:
            # Imported lazily so the module is usable without transformers.
            from transformers import pipeline

            self.pipeline = pipeline(
                "text-generation",
                model=self.model_name,
                device=-1,  # Use CPU
                torch_dtype="auto",
            )
            logger.info("Model %s loaded successfully", self.model_name)
        except Exception as e:
            # Deliberate best-effort: any load problem (missing package,
            # download failure, out of memory) switches to fallback generation.
            logger.warning("Model loading failed: %s. Using fallback generation.", e)
            self.pipeline = None

    def generate_section(
        self,
        title: str,
        context: str = "",
        topic: str = "",
        word_count: int = 300,
        style: str = "academic",
    ) -> str:
        """
        Generate a single document section.

        Args:
            title: Section title
            context: Additional context for generation
            topic: Main topic of the section
            word_count: Target word count
            style: Writing style (academic, formal, informal, etc.)

        Returns:
            Generated section content
        """
        if not self.pipeline:
            return self._generate_fallback(title, topic, word_count)

        prompt = self._create_prompt(title, context, topic, style, word_count)
        # title/topic are forwarded so a failed model call falls back to
        # content for *this* section rather than a generic placeholder.
        return self._generate_with_model(prompt, word_count, title=title, topic=topic)

    def _create_prompt(
        self, title: str, context: str, topic: str, style: str, word_count: int
    ) -> str:
        """
        Create generation prompt.

        The template is dedented *before* the values are substituted, so a
        multi-line ``context`` cannot change the common indentation and
        defeat the dedent.
        """
        template = dedent(
            """
            Write a {style} section titled "{title}" about {topic}.

            Context: {context}

            Requirements:
            - Approximately {word_count} words
            - Professional {style} tone
            - Well-structured with clear paragraphs
            - Informative and engaging

            Section Content:
            """
        )
        return template.format(
            style=style,
            title=title,
            topic=topic,
            context=context,
            word_count=word_count,
        )

    def _generate_with_model(
        self,
        prompt: str,
        word_count: int,
        title: str = "Content",
        topic: str = "",
    ) -> str:
        """
        Generate using the loaded model.

        Args:
            prompt: Full prompt text passed to the pipeline
            word_count: Target word count for the generated text
            title: Section title, used only if fallback generation is needed
            topic: Section topic, used only if fallback generation is needed

        Returns:
            Generated text without the prompt, or fallback content if the
            model returns nothing or raises.
        """
        try:
            # Rough estimation: ~1.5 tokens per word. The budget is expressed
            # as *new* tokens so the prompt length does not eat into it.
            max_new_tokens = max(1, min(int(word_count * 1.5) + 20, self._MAX_NEW_TOKENS))

            result = self.pipeline(
                prompt,
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
            )

            if result:
                generated_text = result[0]["generated_text"]
                # Extract only the new content after the prompt; only strip
                # the prompt when the output really starts with it.
                if generated_text.startswith(prompt):
                    generated_text = generated_text[len(prompt):]
                content = generated_text.strip()
                if content:
                    return content
        except Exception as e:
            logger.warning("Generation failed: %s. Using fallback.", e)

        return self._generate_fallback(title, topic, word_count)

    def _generate_fallback(self, title: str, topic: str, word_count: int) -> str:
        """
        Generate content using fallback method when model is unavailable.

        Args:
            title: Section title; well-known titles select a tailored opener
            topic: Section topic interpolated into the template sentences
            word_count: Number of words to return

        Returns:
            Template text repeated and truncated to exactly ``word_count``
            words (empty string when ``word_count`` is not positive).
        """
        if word_count <= 0:
            return ""

        templates = {
            "introduction": "This section introduces the key concepts and provides context. ",
            "methodology": "This section describes the methods and approaches used. ",
            "results": "This section presents the key findings and outcomes. ",
            "discussion": "This section analyzes the implications and significance. ",
            "conclusion": "This section summarizes the main points and conclusions. ",
            "literature review": "This section reviews relevant existing research and scholarship. ",
        }

        title_lower = title.strip().lower()
        # Avoid sentences such as "The significance of  cannot be ..." when
        # no topic was supplied.
        subject = topic.strip() or "this topic"
        base_text = templates.get(title_lower, f"This section discusses {subject}. ")

        paragraph = (
            f"{base_text} "
            f"The significance of {subject} cannot be overstated in the context of modern {title_lower}. "
            f"Through careful analysis and consideration, we find that multiple factors contribute to this outcome. "
            f"Furthermore, the evidence suggests that continued research and investigation in this area will yield valuable insights. "
            f"In conclusion, this aspect merits further attention from researchers and practitioners alike."
        )

        # Work in words (not characters) so the target is actually a word
        # count and truncation never cuts a word in half.
        paragraph_words = paragraph.split()
        repeats = -(-word_count // len(paragraph_words))  # ceiling division
        return " ".join((paragraph_words * repeats)[:word_count])

    def generate_document_sections(
        self,
        sections: List[str],
        context: str = "",
        topics: Optional[List[str]] = None,
        style: str = "academic",
        total_words: int = 2000,
    ) -> Dict[str, str]:
        """
        Generate multiple sections for a complete document.

        Args:
            sections: List of section titles
            context: Document context
            topics: Topic for each section; missing entries default to
                "aspect <index>", surplus entries are ignored
            style: Writing style
            total_words: Target total word count

        Returns:
            Dictionary of section_title: content (empty if ``sections`` is
            empty). Duplicate titles share one key; the last one wins.
        """
        if not sections:
            return {}

        # Pad topics so every section is generated even when fewer topics
        # than sections were supplied.
        section_topics = list(topics) if topics is not None else []
        section_topics.extend(
            f"aspect {i}" for i in range(len(section_topics), len(sections))
        )

        # Distribute words across sections
        words_per_section = total_words // len(sections)

        return {
            section: self.generate_section(
                title=section,
                context=context,
                topic=topic,
                word_count=words_per_section,
                style=style,
            )
            for section, topic in zip(sections, section_topics)
        }

    def improve_content(self, content: str) -> str:
        """
        Improve existing content for better readability and flow.

        Args:
            content: Original content

        Returns:
            Improved content
        """
        # Simple improvements without model
        improved = self._improve_sentences(content)
        improved = self._fix_grammar_basic(improved)
        improved = self._improve_flow(improved)
        return improved

    def _improve_sentences(self, text: str) -> str:
        """
        Improve sentence structure.

        Sentences longer than 200 characters that contain at least two
        commas are split at the commas; all pieces are re-joined with single
        spaces.
        """
        # Break up overly long sentences
        sentences = re.split(r"(?<=[.!?])\s+", text)
        improved_sentences = []

        for sent in sentences:
            parts = sent.split(",") if len(sent) > 200 else [sent]
            if len(parts) > 2:
                # NOTE(review): the commas themselves are dropped here, so
                # the pieces are run together without punctuation.
                improved_sentences.extend(parts)
            else:
                improved_sentences.append(sent)

        return " ".join(improved_sentences)

    def _fix_grammar_basic(self, text: str) -> str:
        """Apply basic grammar improvements."""
        # Fix common issues
        text = re.sub(r"\b(a)\s+([aeiou])", r"an \2", text)  # a -> an
        text = re.sub(r"\s+", " ", text)  # Remove extra spaces
        text = re.sub(r"\s([.,;:])", r"\1", text)  # Fix spacing before punctuation
        return text

    def _improve_flow(self, text: str) -> str:
        """Improve text flow by replacing line-initial transition words."""
        transitions = {
            r"^Therefore": "As a result",
            r"^However": "Nevertheless",
            r"^Also": "Additionally",
            r"^Finally": "In conclusion",
        }

        for pattern, replacement in transitions.items():
            # MULTILINE makes ^ match at the start of every line.
            text = re.sub(pattern, replacement, text, flags=re.MULTILINE)

        return text

    def generate_outline(self, topic: str, sections: List[str]) -> Dict[str, List[str]]:
        """
        Generate detailed outline for document.

        Args:
            topic: Main topic
            sections: Section titles

        Returns:
            Outline with key points per section
        """
        outline = {}

        for section in sections:
            section_lower = section.lower()
            # Generate key points for each section
            key_points = [
                f"Overview of {section_lower}",
                f"Key aspects of {section_lower}",
                f"Implications for {topic}",
                f"Current trends in {section_lower}",
                f"Future directions for {section_lower}",
            ]
            outline[section] = key_points[:3]  # Select 3 key points per section

        return outline

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate token count for text.

        Args:
            text: Input text

        Returns:
            Estimated token count
        """
        # Rough estimation: 1 token ≈ 4 characters
        return len(text) // 4