Spaces:

brickfrog
/

ankigen

Build error

App Files Files Community

brickfrog committed on Nov 25, 2025

Commit

2ec553e

verified ·

1 Parent(s): 06f924e

Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

.gitignore +1 -0
ankigen_core/agents/base.py +82 -72
ankigen_core/agents/generators.py +78 -68
ankigen_core/agents/token_tracker.py +20 -9
ankigen_core/card_generator.py +105 -100
ankigen_core/context7.py +136 -109
ankigen_core/crawler.py +135 -81
ankigen_core/ui_logic.py +147 -126

.gitignore CHANGED Viewed

@@ -199,3 +199,4 @@ scripts/
 .taskmasterconfig
 .cursor
 .serena/

 .taskmasterconfig
 .cursor
 .serena/
+.serena/

ankigen_core/agents/base.py CHANGED Viewed

@@ -100,30 +100,17 @@ class BaseAgentWrapper:
             logger.error(f"Failed to initialize agent {self.config.name}: {e}")
             raise
-    async def execute(
-        self, user_input: str, context: Optional[Dict[str, Any]] = None
-    ) -> tuple[Any, Dict[str, Any]]:
-        """Execute the agent with user input and optional context"""
-        if not self.agent:
-            await self.initialize()
-        # Add context to the user input if provided
-        enhanced_input = user_input
-        if context is not None:
-            context_str = "\n".join([f"{k}: {v}" for k, v in context.items()])
-            enhanced_input = f"{user_input}\n\nContext:\n{context_str}"
-        # Execute the agent using Runner.run() with retry logic
-        if self.agent is None:
-            raise ValueError("Agent not initialized")
-        logger.info(f"🤖 EXECUTING AGENT: {self.config.name}")
-        logger.info(f"📝 INPUT: {enhanced_input[:200]}...")
-        import time
-        start_time = time.time()
         for attempt in range(self.config.retry_attempts):
             try:
                 result = await asyncio.wait_for(
@@ -133,63 +120,86 @@ class BaseAgentWrapper:
                     ),
                     timeout=self.config.timeout,
                 )
-                break
             except asyncio.TimeoutError:
                 if attempt < self.config.retry_attempts - 1:
                     logger.warning(
-                        f"Agent {self.config.name} timed out (attempt {attempt + 1}/{self.config.retry_attempts}), retrying..."
                     )
                     continue
-                else:
-                    logger.error(
-                        f"Agent {self.config.name} timed out after {self.config.retry_attempts} attempts"
-                    )
-                    raise
-        try:
-            execution_time = time.time() - start_time
             logger.info(
-                f"Agent {self.config.name} executed successfully in {execution_time:.2f}s"
             )
-            # Extract usage information from raw_responses
-            total_usage = {
-                "input_tokens": 0,
-                "output_tokens": 0,
-                "total_tokens": 0,
-                "requests": 0,
-            }
-            if hasattr(result, "raw_responses") and result.raw_responses:
-                for response in result.raw_responses:
-                    if hasattr(response, "usage") and response.usage:
-                        total_usage["input_tokens"] += response.usage.input_tokens
-                        total_usage["output_tokens"] += response.usage.output_tokens
-                        total_usage["total_tokens"] += response.usage.total_tokens
-                        total_usage["requests"] += response.usage.requests
-                # Track usage with the token tracker
-                track_usage_from_agents_sdk(total_usage, self.config.model)
-                logger.info(f"💰 AGENT USAGE: {total_usage}")
-            # Extract the final output from the result
-            if hasattr(result, "new_items") and result.new_items:
-                # Get the last message content
-                from agents.items import ItemHelpers
-                text_output = ItemHelpers.text_message_outputs(result.new_items)
-                # If we have structured output, the response should already be parsed
-                if self.config.output_type and self.config.output_type is not str:
-                    logger.info(
-                        f"✅ STRUCTURED OUTPUT: {type(text_output)} -> {self.config.output_type}"
-                    )
-                    # The agents SDK should return the structured object directly
-                    return text_output, total_usage
-                else:
-                    return text_output, total_usage
-            else:
-                return str(result), total_usage
         except asyncio.TimeoutError:
             logger.error(

             logger.error(f"Failed to initialize agent {self.config.name}: {e}")
             raise
+    def _enhance_input_with_context(
+        self, user_input: str, context: Optional[Dict[str, Any]]
+    ) -> str:
+        """Add context to user input if provided."""
+        if context is None:
+            return user_input
+        context_str = "\n".join([f"{k}: {v}" for k, v in context.items()])
+        return f"{user_input}\n\nContext:\n{context_str}"
+    async def _execute_with_retry(self, enhanced_input: str) -> Any:
+        """Execute agent with retry logic on timeout."""
         for attempt in range(self.config.retry_attempts):
             try:
                 result = await asyncio.wait_for(
                     ),
                     timeout=self.config.timeout,
                 )
+                return result
             except asyncio.TimeoutError:
                 if attempt < self.config.retry_attempts - 1:
                     logger.warning(
+                        f"Agent {self.config.name} timed out "
+                        f"(attempt {attempt + 1}/{self.config.retry_attempts}), retrying..."
                     )
                     continue
+                logger.error(
+                    f"Agent {self.config.name} timed out after {self.config.retry_attempts} attempts"
+                )
+                raise
+        raise RuntimeError("Retry loop exited without result")
+    def _extract_and_track_usage(self, result: Any) -> Dict[str, Any]:
+        """Extract usage info from result and track it."""
+        total_usage = {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "total_tokens": 0,
+            "requests": 0,
+        }
+        if hasattr(result, "raw_responses") and result.raw_responses:
+            for response in result.raw_responses:
+                if hasattr(response, "usage") and response.usage:
+                    total_usage["input_tokens"] += response.usage.input_tokens
+                    total_usage["output_tokens"] += response.usage.output_tokens
+                    total_usage["total_tokens"] += response.usage.total_tokens
+                    total_usage["requests"] += response.usage.requests
+            track_usage_from_agents_sdk(total_usage, self.config.model)
+            logger.info(f"Agent usage: {total_usage}")
+        return total_usage
+    def _extract_output(self, result: Any) -> Any:
+        """Extract final output from agent result."""
+        if not (hasattr(result, "new_items") and result.new_items):
+            return str(result)
+        from agents.items import ItemHelpers
+        text_output = ItemHelpers.text_message_outputs(result.new_items)
+        if self.config.output_type and self.config.output_type is not str:
             logger.info(
+                f"Structured output: {type(text_output)} -> {self.config.output_type}"
             )
+        return text_output
+    async def execute(
+        self, user_input: str, context: Optional[Dict[str, Any]] = None
+    ) -> tuple[Any, Dict[str, Any]]:
+        """Execute the agent with user input and optional context."""
+        if not self.agent:
+            await self.initialize()
+        if self.agent is None:
+            raise ValueError("Agent not initialized")
+        enhanced_input = self._enhance_input_with_context(user_input, context)
+        logger.info(f"Executing agent: {self.config.name}")
+        logger.info(f"Input: {enhanced_input[:200]}...")
+        import time
+        start_time = time.time()
+        try:
+            result = await self._execute_with_retry(enhanced_input)
+            execution_time = time.time() - start_time
+            logger.info(f"Agent {self.config.name} executed in {execution_time:.2f}s")
+            total_usage = self._extract_and_track_usage(result)
+            output = self._extract_output(result)
+            return output, total_usage
         except asyncio.TimeoutError:
             logger.error(

ankigen_core/agents/generators.py CHANGED Viewed

@@ -67,10 +67,8 @@ class SubjectExpertAgent(BaseAgentWrapper):
                 "subject_expert configuration not found - agent system not properly initialized"
             )
-        # Enable structured output for card generation
         base_config.output_type = CardsGenerationSchema
-        # Customize instructions for the specific subject
         if subject != "general" and base_config.custom_prompts:
             subject_prompt = base_config.custom_prompts.get(subject.lower(), "")
             if subject_prompt:
@@ -81,102 +79,114 @@ class SubjectExpertAgent(BaseAgentWrapper):
         super().__init__(base_config, openai_client)
         self.subject = subject
     async def generate_cards(
         self, topic: str, num_cards: int = 5, context: Optional[Dict[str, Any]] = None
     ) -> List[Card]:
-        """Generate flashcards for a given topic with automatic batching for large requests"""
-        try:
-            # Use batching for large numbers of cards to avoid LLM limitations
-            batch_size = 10  # Generate max 10 cards per batch
-            all_cards = []
-            total_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
-            cards_remaining = num_cards
-            batch_num = 1
-            logger.info(
-                f"Generating {num_cards} cards for topic '{topic}' using {((num_cards - 1) // batch_size) + 1} batches"
-            )
-            # Track card topics from previous batches to avoid duplication
-            previous_card_topics = []
             while cards_remaining > 0:
-                cards_in_this_batch = min(batch_size, cards_remaining)
-                logger.info(
-                    f"Generating batch {batch_num}: {cards_in_this_batch} cards"
-                )
-                # Initialize agent only once - Runner.run() creates fresh context each time
-                # No conversation history accumulation across batches (significant performance gain)
                 if not self.agent:
                     await self.initialize()
-                user_input = (
-                    f"Generate {cards_in_this_batch} flashcards for the topic: {topic}"
                 )
-                # Add cloze generation instruction if enabled
-                if context and context.get("generate_cloze"):
-                    user_input += "\n\nIMPORTANT: Generate a mix of card types including cloze cards. For code examples, syntax, and fill-in-the-blank concepts, use cloze cards (card_type='cloze'). Aim for roughly 50% cloze cards when dealing with technical/programming content."
-                if context:
-                    user_input += f"\n\nAdditional context: {context}"
-                # Add previous topics to avoid repetition instead of full conversation history
-                if previous_card_topics:
-                    topics_summary = ", ".join(
-                        previous_card_topics[-20:]
-                    )  # Last 20 topics to keep it manageable
-                    user_input += f"\n\nAvoid creating cards about these already covered topics: {topics_summary}"
-                if batch_num > 1:
-                    user_input += f"\n\nThis is batch {batch_num} of cards. Ensure these cards cover different aspects of the topic."
                 response, usage = await self.execute(user_input, context)
-                # Accumulate usage information
-                if usage:
-                    for key in total_usage:
-                        total_usage[key] += usage.get(key, 0)
                 batch_cards = self._parse_cards_response(response, topic)
                 all_cards.extend(batch_cards)
-                # Extract topics from generated cards to avoid duplication in next batch
-                for card in batch_cards:
-                    if hasattr(card, "front") and card.front and card.front.question:
-                        # Extract key terms from the question for deduplication
-                        question_words = card.front.question.lower().split()
-                        key_terms = [word for word in question_words if len(word) > 3][
-                            :3
-                        ]  # First 3 meaningful words
-                        if key_terms:
-                            previous_card_topics.append(" ".join(key_terms))
                 cards_remaining -= len(batch_cards)
-                batch_num += 1
                 logger.info(
-                    f"Batch {batch_num - 1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
                 )
-                # Safety check to prevent infinite loops
                 if len(batch_cards) == 0:
-                    logger.warning(
-                        f"No cards generated in batch {batch_num - 1}, stopping generation"
-                    )
                     break
-            # Log final usage information
             if total_usage.get("total_tokens", 0) > 0:
                 logger.info(
-                    f"💰 Total Token Usage: {total_usage['total_tokens']} tokens (Input: {total_usage['input_tokens']}, Output: {total_usage['output_tokens']})"
                 )
             logger.info(
-                f"✅ Generated {len(all_cards)} cards total across {batch_num - 1} batches for topic '{topic}'"
             )
             return all_cards

                 "subject_expert configuration not found - agent system not properly initialized"
             )
         base_config.output_type = CardsGenerationSchema
         if subject != "general" and base_config.custom_prompts:
             subject_prompt = base_config.custom_prompts.get(subject.lower(), "")
             if subject_prompt:
         super().__init__(base_config, openai_client)
         self.subject = subject
+    def _build_batch_prompt(
+        self,
+        topic: str,
+        cards_in_batch: int,
+        batch_num: int,
+        context: Optional[Dict[str, Any]],
+        previous_topics: List[str],
+    ) -> str:
+        """Build user input prompt for a batch of cards."""
+        user_input = f"Generate {cards_in_batch} flashcards for the topic: {topic}"
+        if context and context.get("generate_cloze"):
+            user_input += (
+                "\n\nIMPORTANT: Generate a mix of card types including cloze cards. "
+                "For code examples, syntax, and fill-in-the-blank concepts, use cloze cards "
+                "(card_type='cloze'). Aim for roughly 50% cloze cards when dealing with technical/programming content."
+            )
+        if context:
+            user_input += f"\n\nAdditional context: {context}"
+        if previous_topics:
+            topics_summary = ", ".join(previous_topics[-20:])
+            user_input += f"\n\nAvoid creating cards about these already covered topics: {topics_summary}"
+        if batch_num > 1:
+            user_input += f"\n\nThis is batch {batch_num} of cards. Ensure these cards cover different aspects of the topic."
+        return user_input
+    def _extract_topics_for_dedup(self, batch_cards: List[Card]) -> List[str]:
+        """Extract key terms from card questions for deduplication."""
+        topics = []
+        for card in batch_cards:
+            if hasattr(card, "front") and card.front and card.front.question:
+                question_words = card.front.question.lower().split()
+                key_terms = [word for word in question_words if len(word) > 3][:3]
+                if key_terms:
+                    topics.append(" ".join(key_terms))
+        return topics
+    def _accumulate_usage(
+        self, total_usage: Dict[str, int], batch_usage: Optional[Dict[str, Any]]
+    ) -> None:
+        """Accumulate batch usage into total usage."""
+        if batch_usage:
+            for key in total_usage:
+                total_usage[key] += batch_usage.get(key, 0)
     async def generate_cards(
         self, topic: str, num_cards: int = 5, context: Optional[Dict[str, Any]] = None
     ) -> List[Card]:
+        """Generate flashcards for a given topic with automatic batching."""
+        batch_size = 10
+        all_cards: List[Card] = []
+        total_usage: Dict[str, int] = {
+            "total_tokens": 0,
+            "input_tokens": 0,
+            "output_tokens": 0,
+        }
+        previous_topics: List[str] = []
+        cards_remaining = num_cards
+        batch_num = 1
+        num_batches = ((num_cards - 1) // batch_size) + 1
+        logger.info(
+            f"Generating {num_cards} cards for '{topic}' using {num_batches} batches"
+        )
+        try:
             while cards_remaining > 0:
+                cards_in_batch = min(batch_size, cards_remaining)
+                logger.info(f"Generating batch {batch_num}: {cards_in_batch} cards")
                 if not self.agent:
                     await self.initialize()
+                user_input = self._build_batch_prompt(
+                    topic, cards_in_batch, batch_num, context, previous_topics
                 )
                 response, usage = await self.execute(user_input, context)
+                self._accumulate_usage(total_usage, usage)
                 batch_cards = self._parse_cards_response(response, topic)
                 all_cards.extend(batch_cards)
+                previous_topics.extend(self._extract_topics_for_dedup(batch_cards))
                 cards_remaining -= len(batch_cards)
                 logger.info(
+                    f"Batch {batch_num} generated {len(batch_cards)} cards. {cards_remaining} remaining."
                 )
                 if len(batch_cards) == 0:
+                    logger.warning(f"No cards generated in batch {batch_num}, stopping")
                     break
+                batch_num += 1
             if total_usage.get("total_tokens", 0) > 0:
                 logger.info(
+                    f"Total usage: {total_usage['total_tokens']} tokens "
+                    f"(Input: {total_usage['input_tokens']}, Output: {total_usage['output_tokens']})"
                 )
             logger.info(
+                f"Generated {len(all_cards)} cards across {batch_num} batches for '{topic}'"
             )
             return all_cards

ankigen_core/agents/token_tracker.py CHANGED Viewed

@@ -34,6 +34,25 @@ class TokenTracker:
     def count_tokens_for_messages(
         self, messages: List[Dict[str, str]], model: str
     ) -> int:
         try:
             encoding = tiktoken.encoding_for_model(model)
         except KeyError:
@@ -61,11 +80,6 @@ class TokenTracker:
         return len(encoding.encode(text))
-    def estimate_cost(
-        self, prompt_tokens: int, completion_tokens: int, model: str
-    ) -> Optional[float]:
-        return None
     def track_usage_from_response(
         self, response_data, model: str
     ) -> Optional[TokenUsage]:
@@ -98,10 +112,7 @@ class TokenTracker:
     ) -> TokenUsage:
         total_tokens = prompt_tokens + completion_tokens
-        if actual_cost is not None:
-            final_cost = actual_cost
-        else:
-            final_cost = self.estimate_cost(prompt_tokens, completion_tokens, model)
         usage = TokenUsage(
             prompt_tokens=prompt_tokens,

     def count_tokens_for_messages(
         self, messages: List[Dict[str, str]], model: str
     ) -> int:
+        """
+        Count total tokens for a list of chat messages using tiktoken.
+        Implements OpenAI's token counting algorithm for chat completions:
+        - Each message adds 3 tokens for role/content/structure overhead
+        - Message names add an additional token
+        - The entire message list adds 3 tokens for conversation wrapper
+        The encoding is selected based on the model:
+        - Attempts to use model-specific encoding via tiktoken
+        - Falls back to 'o200k_base' (GPT-4o encoding) for unknown models
+        Args:
+            messages: List of message dicts (each with 'role', 'content', optional 'name')
+            model: OpenAI model identifier (e.g., 'gpt-4.1', 'gpt-4o')
+        Returns:
+            Total tokens required to send these messages to the model
+        """
         try:
             encoding = tiktoken.encoding_for_model(model)
         except KeyError:
         return len(encoding.encode(text))
     def track_usage_from_response(
         self, response_data, model: str
     ) -> Optional[TokenUsage]:
     ) -> TokenUsage:
         total_tokens = prompt_tokens + completion_tokens
+        final_cost = actual_cost  # Cost estimation removed - rely on API-provided costs
         usage = TokenUsage(
             prompt_tokens=prompt_tokens,

ankigen_core/card_generator.py CHANGED Viewed

@@ -70,10 +70,58 @@ GENERATION_MODES = [
 # Legacy functions removed - all card generation now handled by agent system
-async def orchestrate_card_generation(  # MODIFIED: Added async
-    client_manager: OpenAIClientManager,  # Expect the manager
-    cache: ResponseCache,  # Expect the cache instance
-    # --- UI Inputs --- (These will be passed from app.py handler)
     api_key_input: str,
     subject: str,
     generation_mode: str,
@@ -89,109 +137,66 @@ async def orchestrate_card_generation(  # MODIFIED: Added async
     library_topic: str = None,
 ):
     """Orchestrates the card generation process based on UI inputs."""
     logger.info(f"Starting card generation orchestration in {generation_mode} mode")
     logger.debug(
-        f"Parameters: mode={generation_mode}, topics={topic_number}, cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
     )
-    # --- AGENT SYSTEM INTEGRATION ---
-    if AGENTS_AVAILABLE:
-        logger.info("🤖 Using agent system for card generation")
-        try:
-            from ankigen_core.agents.token_tracker import get_token_tracker
-            token_tracker = get_token_tracker()
-            orchestrator = AgentOrchestrator(client_manager)
-            logger.info(f"Using {model_name} for SubjectExpertAgent")
-            await orchestrator.initialize(api_key_input, {"subject_expert": model_name})
-            # Map generation mode to subject
-            agent_subject = "general"
-            if generation_mode == "subject":
-                agent_subject = subject if subject else "general"
-            elif generation_mode == "path":
-                agent_subject = "curriculum_design"
-            elif generation_mode == "text":
-                agent_subject = "content_analysis"
-            total_cards_needed = topic_number * cards_per_topic
-            context = {}
-            if generation_mode == "text" and source_text:
-                context["source_text"] = source_text
-            agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
-                topic=subject if subject else "Mixed Topics",
-                subject=agent_subject,
-                num_cards=total_cards_needed,
-                difficulty="intermediate",
-                context=context,
-                library_name=library_name,
-                library_topic=library_topic,
-                generate_cloze=generate_cloze,
-            )
-            # Get token usage from session
-            try:
-                # Try both method names for compatibility
-                if hasattr(token_tracker, "get_session_summary"):
-                    token_usage = token_tracker.get_session_summary()
-                elif hasattr(token_tracker, "get_session_usage"):
-                    token_usage = token_tracker.get_session_usage()
-                else:
-                    raise AttributeError("TokenTracker has no session summary method")
-                token_usage_html = f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
-            except Exception as e:
-                logger.error(f"Token usage collection failed: {e}")
-                token_usage_html = "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
-            # Convert agent cards to dataframe format
-            if agent_cards:
-                formatted_cards = format_cards_for_dataframe(
-                    agent_cards,
-                    topic_name=subject if subject else "General",
-                    start_index=1,
-                )
-                output_df = pd.DataFrame(
-                    formatted_cards, columns=get_dataframe_columns()
-                )
-                total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
-                logger.info(
-                    f"Agent system generated {len(output_df)} cards successfully"
-                )
-                return output_df, total_cards_message, token_usage_html
-            else:
-                logger.error("Agent system returned no cards")
-                gr.Error("🤖 Agent system returned no cards")
-                return (
-                    pd.DataFrame(columns=get_dataframe_columns()),
-                    "Agent system returned no cards.",
-                    "",
-                )
-        except Exception as e:
-            logger.error(f"Agent system failed: {e}")
-            gr.Error(f"🤖 Agent system error: {str(e)}")
-            return (
-                pd.DataFrame(columns=get_dataframe_columns()),
-                f"Agent system error: {str(e)}",
-                "",
             )
-    # Agent system is required and should never fail to be available
-    logger.error("Agent system failed but is required - this should not happen")
-    gr.Error("Agent system is required but not available")
-    return (
-        pd.DataFrame(columns=get_dataframe_columns()),
-        "Agent system error",
-        "",
-    )
 # Legacy helper functions removed - all processing now handled by agent system

 # Legacy functions removed - all card generation now handled by agent system
+def _map_generation_mode_to_subject(generation_mode: str, subject: str) -> str:
+    """Map UI generation mode to agent subject."""
+    if generation_mode == "subject":
+        return subject if subject else "general"
+    elif generation_mode == "path":
+        return "curriculum_design"
+    elif generation_mode == "text":
+        return "content_analysis"
+    return "general"
+def _build_generation_context(generation_mode: str, source_text: str) -> Dict[str, Any]:
+    """Build context dict for card generation."""
+    context: Dict[str, Any] = {}
+    if generation_mode == "text" and source_text:
+        context["source_text"] = source_text
+    return context
+def _get_token_usage_html(token_tracker) -> str:
+    """Extract token usage and format as HTML."""
+    try:
+        if hasattr(token_tracker, "get_session_summary"):
+            token_usage = token_tracker.get_session_summary()
+        elif hasattr(token_tracker, "get_session_usage"):
+            token_usage = token_tracker.get_session_usage()
+        else:
+            raise AttributeError("TokenTracker has no session summary method")
+        return f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
+    except Exception as e:
+        logger.error(f"Token usage collection failed: {e}")
+        return "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
+def _format_cards_to_dataframe(
+    agent_cards: List[Card], subject: str
+) -> tuple[pd.DataFrame, str]:
+    """Format agent cards to DataFrame and generate message."""
+    formatted_cards = format_cards_for_dataframe(
+        agent_cards,
+        topic_name=subject if subject else "General",
+        start_index=1,
+    )
+    output_df = pd.DataFrame(formatted_cards, columns=get_dataframe_columns())
+    total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
+    return output_df, total_cards_message
+async def orchestrate_card_generation(
+    client_manager: OpenAIClientManager,
+    cache: ResponseCache,
     api_key_input: str,
     subject: str,
     generation_mode: str,
     library_topic: str = None,
 ):
     """Orchestrates the card generation process based on UI inputs."""
     logger.info(f"Starting card generation orchestration in {generation_mode} mode")
     logger.debug(
+        f"Parameters: mode={generation_mode}, topics={topic_number}, "
+        f"cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
     )
+    if not AGENTS_AVAILABLE:
+        logger.error("Agent system is required but not available")
+        gr.Error("Agent system is required but not available")
+        return pd.DataFrame(columns=get_dataframe_columns()), "Agent system error", ""
+    try:
+        from ankigen_core.agents.token_tracker import get_token_tracker
+        token_tracker = get_token_tracker()
+        orchestrator = AgentOrchestrator(client_manager)
+        logger.info(f"Using {model_name} for SubjectExpertAgent")
+        await orchestrator.initialize(api_key_input, {"subject_expert": model_name})
+        agent_subject = _map_generation_mode_to_subject(generation_mode, subject)
+        context = _build_generation_context(generation_mode, source_text)
+        total_cards_needed = topic_number * cards_per_topic
+        agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
+            topic=subject if subject else "Mixed Topics",
+            subject=agent_subject,
+            num_cards=total_cards_needed,
+            difficulty="intermediate",
+            context=context,
+            library_name=library_name,
+            library_topic=library_topic,
+            generate_cloze=generate_cloze,
+        )
+        token_usage_html = _get_token_usage_html(token_tracker)
+        if agent_cards:
+            output_df, total_cards_message = _format_cards_to_dataframe(
+                agent_cards, subject
             )
+            logger.info(f"Agent system generated {len(output_df)} cards successfully")
+            return output_df, total_cards_message, token_usage_html
+        logger.error("Agent system returned no cards")
+        gr.Error("Agent system returned no cards")
+        return (
+            pd.DataFrame(columns=get_dataframe_columns()),
+            "Agent system returned no cards.",
+            "",
+        )
+    except Exception as e:
+        logger.error(f"Agent system failed: {e}")
+        gr.Error(f"Agent system error: {str(e)}")
+        return (
+            pd.DataFrame(columns=get_dataframe_columns()),
+            f"Agent system error: {str(e)}",
+            "",
+        )
 # Legacy helper functions removed - all processing now handled by agent system

ankigen_core/context7.py CHANGED Viewed

@@ -123,6 +123,129 @@ class Context7Client:
             logger.error(f"Error calling Context7 tool {tool_name}: {e}")
             return {"error": str(e), "success": False}
     async def resolve_library_id(self, library_name: str) -> Optional[str]:
         """Resolve a library name to a Context7-compatible ID"""
         logger.info(f"Resolving library ID for: {library_name}")
@@ -131,115 +254,19 @@ class Context7Client:
             "resolve-library-id", {"libraryName": library_name}
         )
-        if result and result.get("success") and result.get("text"):
-            text = result["text"]
-            # Parse the structured response format
-            libraries = []
-            lines = text.split("\n")
-            current_lib = {}
-            for line in lines:
-                line = line.strip()
-                # Parse title
-                if line.startswith("- Title:"):
-                    if current_lib and current_lib.get("id"):
-                        libraries.append(current_lib)
-                    current_lib = {
-                        "title": line.replace("- Title:", "").strip().lower()
-                    }
-                # Parse library ID
-                elif line.startswith("- Context7-compatible library ID:"):
-                    lib_id = line.replace(
-                        "- Context7-compatible library ID:", ""
-                    ).strip()
-                    if current_lib is not None:
-                        current_lib["id"] = lib_id
-                # Parse code snippets count
-                elif line.startswith("- Code Snippets:"):
-                    snippets_str = line.replace("- Code Snippets:", "").strip()
-                    try:
-                        snippets = int(snippets_str)
-                        if current_lib is not None:
-                            current_lib["snippets"] = snippets
-                    except ValueError:
-                        pass
-                # Parse trust score
-                elif line.startswith("- Trust Score:"):
-                    score_str = line.replace("- Trust Score:", "").strip()
-                    try:
-                        trust = float(score_str)
-                        if current_lib is not None:
-                            current_lib["trust"] = trust
-                    except ValueError:
-                        pass
-            # Add the last library if exists
-            if current_lib and current_lib.get("id"):
-                libraries.append(current_lib)
-            # If we found libraries, pick the best match
-            if libraries:
-                search_term = library_name.lower()
-                # Score each library
-                best_lib = None
-                best_score = -1
-                for lib in libraries:
-                    score = 0
-                    lib_title = lib.get("title", "")
-                    lib_id = lib["id"].lower()
-                    # Exact title match gets highest priority
-                    if lib_title == search_term:
-                        score += 10000
-                    # Check if it's exactly "pandas" in the path (not geopandas, etc)
-                    elif lib_id == f"/{search_term}-dev/{search_term}":
-                        score += 5000
-                    elif f"/{search_term}/" in lib_id or lib_id.endswith(
-                        f"/{search_term}"
-                    ):
-                        score += 2000
-                    # Partial title match (but penalize if it's a compound like "geopandas")
-                    elif search_term in lib_title:
-                        if lib_title == search_term:
-                            score += 1000
-                        elif lib_title.startswith(search_term):
-                            score += 200
-                        else:
-                            score += 50
-                    # Strong bonus for code snippets (indicates main library)
-                    snippets = lib.get("snippets", 0)
-                    score += snippets / 10  # Pandas has 7386 snippets
-                    # Significant bonus for trust score (high trust = official/authoritative)
-                    trust = lib.get("trust", 0)
-                    score += trust * 100  # Trust 9.2 = 920 points, Trust 7 = 700 points
-                    # Debug logging
-                    if search_term in lib_title or search_term in lib_id:
-                        logger.debug(
-                            f"Scoring {lib['id']}: title='{lib_title}', snippets={snippets}, "
-                            f"trust={trust}, score={score:.2f}"
-                        )
-                    if score > best_score:
-                        best_score = score
-                        best_lib = lib
-                if best_lib:
-                    logger.info(
-                        f"Resolved '{library_name}' to ID: {best_lib['id']} "
-                        f"(title: {best_lib.get('title', 'unknown')}, snippets: {best_lib.get('snippets', 0)}, "
-                        f"trust: {best_lib.get('trust', 0)}, score: {best_score:.2f})"
-                    )
-                    return best_lib["id"]
         logger.warning(f"Could not resolve library ID for '{library_name}'")
         return None

             logger.error(f"Error calling Context7 tool {tool_name}: {e}")
             return {"error": str(e), "success": False}
+    def _parse_library_response(self, text: str) -> list[Dict[str, Any]]:
+        """Parse Context7 response text into list of library dicts.
+        Each "- Title:" line starts a new entry; the ID, snippet-count and
+        trust-score lines that follow are attached to the entry currently
+        being built. An entry is kept only if it received a non-empty ID.
+        Args:
+            text: Raw text response from Context7
+        Returns:
+            List of library dicts with keys: title, id, snippets, trust
+            (snippets/trust are omitted when missing or not numeric)
+        """
+        title_prefix = "- Title:"
+        id_prefix = "- Context7-compatible library ID:"
+        snippets_prefix = "- Code Snippets:"
+        trust_prefix = "- Trust Score:"
+        libraries: list[Dict[str, Any]] = []
+        current_lib: Dict[str, Any] = {}
+        for raw_line in text.split("\n"):
+            line = raw_line.strip()
+            if line.startswith(title_prefix):
+                # A new title closes the previous entry; keep it only if it got an ID.
+                if current_lib.get("id"):
+                    libraries.append(current_lib)
+                # Slice off the leading marker only: str.replace() would also
+                # delete the marker text if it reappeared inside the value.
+                current_lib = {"title": line[len(title_prefix):].strip().lower()}
+            elif line.startswith(id_prefix):
+                current_lib["id"] = line[len(id_prefix):].strip()
+            elif line.startswith(snippets_prefix):
+                try:
+                    current_lib["snippets"] = int(line[len(snippets_prefix):].strip())
+                except ValueError:
+                    # Non-integer count: leave "snippets" unset for this entry.
+                    pass
+            elif line.startswith(trust_prefix):
+                try:
+                    current_lib["trust"] = float(line[len(trust_prefix):].strip())
+                except ValueError:
+                    # Non-numeric score: leave "trust" unset for this entry.
+                    pass
+        # Flush the final entry, which has no following title line to close it.
+        if current_lib.get("id"):
+            libraries.append(current_lib)
+        return libraries
+    def _score_library(self, lib: Dict[str, Any], search_term: str) -> float:
+        """Score a library based on how well it matches the search term.
+        Exactly one name-match tier applies (the first that matches, in the
+        order below); the snippet and trust bonuses are then added on top.
+        Args:
+            lib: Library dict with title, id, snippets, trust
+            search_term: Lowercase search term
+        Returns:
+            Score (higher is better match)
+        Raises:
+            KeyError: If lib has no "id" key
+        """
+        lib_title = lib.get("title", "")
+        lib_id = lib["id"].lower()
+        if lib_title == search_term:
+            # Exact title match gets highest priority
+            score = 10000.0
+        elif lib_id == f"/{search_term}-dev/{search_term}":
+            # ID of the exact form /<term>-dev/<term>
+            score = 5000.0
+        elif f"/{search_term}/" in lib_id or lib_id.endswith(f"/{search_term}"):
+            # Term is a whole path segment of the ID
+            score = 2000.0
+        elif lib_title.startswith(search_term):
+            # Title begins with the term (exact equality was handled above)
+            score = 200.0
+        elif search_term in lib_title:
+            # Term only appears inside a longer title
+            score = 50.0
+        else:
+            score = 0.0
+        # Bonus for code snippets (indicates main library)
+        score += lib.get("snippets", 0) / 10
+        # Bonus for trust score (high trust = official/authoritative)
+        score += lib.get("trust", 0) * 100
+        return score
+    def _select_best_library(
+        self, libraries: list[Dict[str, Any]], search_term: str
+    ) -> Optional[Dict[str, Any]]:
+        """Select the best matching library from a list.
+        Every library is scored with _score_library and the highest score
+        wins; on a tie the earliest library in the list is kept.
+        Args:
+            libraries: List of library dicts (each must have an "id" key)
+            search_term: Lowercase search term
+        Returns:
+            Best matching library dict, or None if the list is empty or no
+            score exceeds the initial -1.0
+        """
+        best_lib = None
+        best_score = -1.0
+        for lib in libraries:
+            score = self._score_library(lib, search_term)
+            # Only log candidates whose title or ID mentions the term, to keep debug output short
+            if search_term in lib.get("title", "") or search_term in lib["id"].lower():
+                logger.debug(
+                    f"Scoring {lib['id']}: title='{lib.get('title', '')}', "
+                    f"snippets={lib.get('snippets', 0)}, trust={lib.get('trust', 0)}, score={score:.2f}"
+                )
+            # Strict ">" keeps the first library seen when scores are equal
+            if score > best_score:
+                best_score = score
+                best_lib = lib
+        if best_lib:
+            logger.info(
+                f"Selected library: {best_lib['id']} (title: {best_lib.get('title', 'unknown')}, "
+                f"snippets: {best_lib.get('snippets', 0)}, trust: {best_lib.get('trust', 0)}, "
+                f"score: {best_score:.2f})"
+            )
+        return best_lib
     async def resolve_library_id(self, library_name: str) -> Optional[str]:
         """Resolve a library name to a Context7-compatible ID"""
         logger.info(f"Resolving library ID for: {library_name}")
             "resolve-library-id", {"libraryName": library_name}
         )
+        if not (result and result.get("success") and result.get("text")):
+            logger.warning(f"Could not resolve library ID for '{library_name}'")
+            return None
+        libraries = self._parse_library_response(result["text"])
+        if not libraries:
+            logger.warning(f"Could not resolve library ID for '{library_name}'")
+            return None
+        best_lib = self._select_best_library(libraries, library_name.lower())
+        if best_lib:
+            logger.info(f"Resolved '{library_name}' to ID: {best_lib['id']}")
+            return best_lib["id"]
         logger.warning(f"Could not resolve library ID for '{library_name}'")
         return None

ankigen_core/crawler.py CHANGED Viewed

@@ -418,119 +418,173 @@ class WebCrawler:
         return False, None
     def crawl(
         self, progress_callback: Optional[Callable[[int, int, str], None]] = None
     ) -> List[CrawledPage]:
-        # Initialize URLs using helper method
         urls_to_visit = self._initialize_crawl_queue()
         crawled_pages: List[CrawledPage] = []
-        initial_total_for_progress = len(urls_to_visit)
         processed_count = 0
         while urls_to_visit:
             current_url, current_depth, current_parent_url = urls_to_visit.pop(0)
-            current_total_for_progress = (
-                initial_total_for_progress
-                if self.use_sitemap
-                else processed_count + len(urls_to_visit) + 1
             )
-            if progress_callback:
-                progress_callback(
-                    processed_count,
-                    current_total_for_progress,
-                    current_url,
-                )
-            # Check if URL should be skipped using helper method
             should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
             if should_skip:
-                if progress_callback and skip_reason:
-                    dynamic_total = (
-                        initial_total_for_progress
-                        if self.use_sitemap
-                        else processed_count + len(urls_to_visit) + 1
                     )
-                    progress_callback(processed_count, dynamic_total, skip_reason)
                 continue
             self.logger.info(
-                f"Crawling (Depth {current_depth}): {current_url} ({processed_count + 1}/{current_total_for_progress})"
             )
-            if progress_callback:
-                progress_callback(
-                    processed_count, current_total_for_progress, current_url
-                )
             self.visited_urls.add(current_url)
             self.rate_limiter.wait()
             try:
-                response = self.session.get(current_url, timeout=10)
-                response.raise_for_status()
-                html_content = response.text
-                soup = BeautifulSoup(html_content, "html.parser")
-                # Extract metadata using helper method
-                page_title, meta_description, meta_keywords = (
-                    self._extract_page_metadata(soup, current_url)
-                )
-                text_content = self._extract_text(soup)
-                page_data = CrawledPage(
-                    url=current_url,
-                    html_content=html_content,
-                    text_content=text_content,
-                    title=page_title,
-                    meta_description=meta_description,
-                    meta_keywords=meta_keywords,
-                    crawl_depth=current_depth,
-                    parent_url=current_parent_url,
                 )
                 crawled_pages.append(page_data)
                 self.logger.info(f"Successfully processed and stored: {current_url}")
-                if current_depth < self.max_depth:
-                    found_links = self._extract_links(soup, current_url)
-                    self.logger.debug(
-                        f"Found {len(found_links)} links on {current_url}"
-                    )
-                    for link in found_links:
-                        if link not in self.visited_urls:
-                            urls_to_visit.append((link, current_depth + 1, current_url))
-            except requests.exceptions.HTTPError as e:
-                self.logger.error(
-                    f"HTTPError for {current_url}: {e.response.status_code} - {e.response.reason}. Response: {e.response.text[:200]}...",
-                    exc_info=False,
-                )
-                processed_count += 1
-            except requests.exceptions.ConnectionError as e:
-                self.logger.error(
-                    f"ConnectionError for {current_url}: {e}", exc_info=False
-                )
-                processed_count += 1
-            except requests.exceptions.Timeout as e:
-                self.logger.error(f"Timeout for {current_url}: {e}", exc_info=False)
-                processed_count += 1
-            except requests.exceptions.RequestException as e:
-                self.logger.error(
-                    f"RequestException for {current_url}: {e}", exc_info=True
                 )
-                processed_count += 1
             except Exception as e:
-                self.logger.error(
-                    f"An unexpected error occurred while processing {current_url}: {e}",
-                    exc_info=True,
-                )
                 processed_count += 1
         self.logger.info(
-            f"Crawl completed. Total pages processed/attempted: {processed_count}. Successfully crawled pages: {len(crawled_pages)}"
         )
         if progress_callback:
             progress_callback(processed_count, processed_count, "Crawling complete.")

         return False, None
+    def _calculate_progress_total(
+        self, processed_count: int, urls_to_visit_len: int, initial_total: int
+    ) -> int:
+        """Calculate the total for progress reporting.
+        Args:
+            processed_count: Number of URLs processed/attempted so far
+            urls_to_visit_len: Number of URLs still waiting in the queue
+            initial_total: Queue length captured before crawling started
+        Returns:
+            initial_total when use_sitemap is set, otherwise a running
+            estimate of processed + queued + 1
+        """
+        # Sitemap mode: report against the queue size captured up front
+        if self.use_sitemap:
+            return initial_total
+        # Link-following mode: the queue changes as links are discovered, so
+        # estimate from current state. The +1 presumably counts the URL that
+        # crawl() has already popped from the queue - confirm against caller.
+        return processed_count + urls_to_visit_len + 1
+    def _update_crawl_progress(
+        self,
+        progress_callback: Optional[Callable[[int, int, str], None]],
+        processed_count: int,
+        urls_to_visit_len: int,
+        initial_total: int,
+        message: str,
+    ) -> None:
+        """Update progress callback if provided.
+        Args:
+            progress_callback: Callback taking (processed, total, message), or None
+            processed_count: Number of URLs processed/attempted so far
+            urls_to_visit_len: Number of URLs still waiting in the queue
+            initial_total: Queue length captured before crawling started
+            message: Text passed through to the callback (a URL or a skip reason)
+        """
+        # No-op when the caller did not supply a callback
+        if progress_callback:
+            total = self._calculate_progress_total(
+                processed_count, urls_to_visit_len, initial_total
+            )
+            progress_callback(processed_count, total, message)
+    def _fetch_and_parse_url(
+        self, url: str, depth: int, parent_url: Optional[str]
+    ) -> Tuple[CrawledPage, BeautifulSoup]:
+        """Fetch URL and create CrawledPage object.
+        The parsed soup is returned alongside the page so the caller can
+        extract links without parsing the HTML a second time.
+        Args:
+            url: URL to fetch
+            depth: Current crawl depth
+            parent_url: URL of the parent page
+        Returns:
+            Tuple of (CrawledPage, BeautifulSoup) for further processing
+        Raises:
+            requests.RequestException: If the HTTP request fails
+        """
+        # Fixed 10-second timeout for each request
+        response = self.session.get(url, timeout=10)
+        # Turn HTTP error statuses into exceptions for the caller to handle
+        response.raise_for_status()
+        html_content = response.text
+        soup = BeautifulSoup(html_content, "html.parser")
+        page_title, meta_description, meta_keywords = self._extract_page_metadata(
+            soup, url
+        )
+        text_content = self._extract_text(soup)
+        return CrawledPage(
+            url=url,
+            html_content=html_content,
+            text_content=text_content,
+            title=page_title,
+            meta_description=meta_description,
+            meta_keywords=meta_keywords,
+            crawl_depth=depth,
+            parent_url=parent_url,
+        ), soup
+    def _enqueue_discovered_links(
+        self,
+        soup: BeautifulSoup,
+        current_url: str,
+        current_depth: int,
+        urls_to_visit: List[Tuple[str, int, Optional[str]]],
+    ) -> None:
+        """Extract links from page and add unvisited ones to queue.
+        Args:
+            soup: Parsed HTML of the page at current_url
+            current_url: URL of the page the links were found on
+            current_depth: Crawl depth of current_url
+            urls_to_visit: Crawl queue of (url, depth, parent_url); mutated in place
+        """
+        # Pages at max_depth are stored but their links are not followed
+        if current_depth >= self.max_depth:
+            return
+        found_links = self._extract_links(soup, current_url)
+        self.logger.debug(f"Found {len(found_links)} links on {current_url}")
+        for link in found_links:
+            # Only visited_urls is checked here; a link already waiting in the
+            # queue can be appended again
+            if link not in self.visited_urls:
+                urls_to_visit.append((link, current_depth + 1, current_url))
+    def _handle_crawl_error(self, url: str, error: Exception) -> None:
+        """Log crawl error with appropriate detail level.
+        The error is only logged, never re-raised. Specific requests
+        exception types are tested before the generic RequestException,
+        so the order of the branches matters.
+        Args:
+            url: URL that was being processed when the error occurred
+            error: Exception caught by the crawl loop
+        """
+        if isinstance(error, requests.exceptions.HTTPError):
+            # NOTE(review): assumes error.response is set, as it is when the error
+            # comes from raise_for_status() - confirm no other HTTPError source
+            self.logger.error(
+                f"HTTPError for {url}: {error.response.status_code} - {error.response.reason}. "
+                f"Response: {error.response.text[:200]}...",
+                exc_info=False,
+            )
+        elif isinstance(error, requests.exceptions.ConnectionError):
+            self.logger.error(f"ConnectionError for {url}: {error}", exc_info=False)
+        elif isinstance(error, requests.exceptions.Timeout):
+            self.logger.error(f"Timeout for {url}: {error}", exc_info=False)
+        elif isinstance(error, requests.exceptions.RequestException):
+            # Any other requests failure: include the traceback
+            self.logger.error(f"RequestException for {url}: {error}", exc_info=True)
+        else:
+            # Non-requests failure (e.g. during parsing): include the traceback
+            self.logger.error(
+                f"An unexpected error occurred while processing {url}: {error}",
+                exc_info=True,
+            )
     def crawl(
         self, progress_callback: Optional[Callable[[int, int, str], None]] = None
     ) -> List[CrawledPage]:
+        """Crawl website starting from the configured URL.
+        Args:
+            progress_callback: Optional callback for progress updates (processed, total, message)
+        Returns:
+            List of CrawledPage objects for successfully crawled pages
+        """
         urls_to_visit = self._initialize_crawl_queue()
         crawled_pages: List[CrawledPage] = []
+        initial_total = len(urls_to_visit)
         processed_count = 0
         while urls_to_visit:
             current_url, current_depth, current_parent_url = urls_to_visit.pop(0)
+            self._update_crawl_progress(
+                progress_callback,
+                processed_count,
+                len(urls_to_visit),
+                initial_total,
+                current_url,
             )
             should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
             if should_skip:
+                if skip_reason:
+                    self._update_crawl_progress(
+                        progress_callback,
+                        processed_count,
+                        len(urls_to_visit),
+                        initial_total,
+                        skip_reason,
                     )
                 continue
+            total = self._calculate_progress_total(
+                processed_count, len(urls_to_visit), initial_total
+            )
             self.logger.info(
+                f"Crawling (Depth {current_depth}): {current_url} ({processed_count + 1}/{total})"
             )
             self.visited_urls.add(current_url)
             self.rate_limiter.wait()
             try:
+                page_data, soup = self._fetch_and_parse_url(
+                    current_url, current_depth, current_parent_url
                 )
                 crawled_pages.append(page_data)
                 self.logger.info(f"Successfully processed and stored: {current_url}")
+                self._enqueue_discovered_links(
+                    soup, current_url, current_depth, urls_to_visit
                 )
             except Exception as e:
+                self._handle_crawl_error(current_url, e)
                 processed_count += 1
+                continue
+            processed_count += 1
         self.logger.info(
+            f"Crawl completed. Total pages processed/attempted: {processed_count}. "
+            f"Successfully crawled pages: {len(crawled_pages)}"
         )
         if progress_callback:
             progress_callback(processed_count, processed_count, "Crawling complete.")

ankigen_core/ui_logic.py CHANGED Viewed

@@ -3,7 +3,9 @@
 import gradio as gr
 import pandas as pd  # Needed for use_selected_subjects type hinting
 from typing import (
     List,
     Tuple,
 )
 from urllib.parse import urlparse
@@ -12,7 +14,7 @@ from urllib.parse import urlparse
 import re  # For URL validation and filename sanitization
 import asyncio
-from ankigen_core.crawler import WebCrawler
 from ankigen_core.llm_interface import (
     OpenAIClientManager,
 )
@@ -436,6 +438,132 @@ def _basic_sanitize_filename(name: str) -> str:
     return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
 async def crawl_and_generate(
     url: str,
     max_depth: int,
@@ -453,145 +581,46 @@ async def crawl_and_generate(
     status_textbox: gr.Textbox,
 ) -> Tuple[str, List[dict], List[Card]]:
     """Crawls a website, generates Anki cards, and prepares them for export/display."""
-    # Initialize crawler_ui_logger if it's meant to be used here, e.g., at the start of the function
-    # For now, assuming it's available in the scope (e.g., global or passed in if it were a class)
-    # If it's a module-level logger, it should be fine.
-    # Ensure the status_textbox is updated via gr.Info or similar if needed
-    # as it's a parameter but not directly used for output updates in the provided snippet.
-    # It might be used by side-effect if gr.Info/gr.Warning updates it globally, or if it's part of `progress`.
-    # The `status_textbox` parameter is not directly used to set a value in the return,
-    # but `gr.Info` might update a default status area, or it's for other UI purposes.
     crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
-    if not url or not url.startswith(("http://", "https://")):
-        gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
         return "Invalid URL", [], []
     try:
-        urlparse(url)
-        # domain = parsed_url.netloc # allowed_domains is removed from WebCrawler call
-        # if not domain:
-        #     gr.Warning("Could not parse domain from URL. Please enter a valid URL.")
-        #     return "Invalid URL (cannot parse domain)", [], []
-        include_list = [p.strip() for p in include_patterns.split(",") if p.strip()]
-        exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()]
-        # WebCrawler instantiation updated to remove parameters causing issues.
-        # The WebCrawler will use its defaults or other configured ways for these.
-        # The 'requests_per_second' from UI maps to 'delay_between_requests' internally if crawler supports it,
-        # but since 'delay_between_requests' was also flagged, we remove it.
-        # The WebCrawler class itself needs to be checked for its actual constructor parameters.
-        crawler = WebCrawler(
-            start_url=url,
-            max_depth=max_depth,  # Assuming max_depth is still a valid param
-            # allowed_domains=[domain], # Removed based on linter error
-            # delay_between_requests=1.0 / crawler_requests_per_second # Removed
-            # if crawler_requests_per_second > 0
-            # else 0.1,
-            # max_pages=500, # Removed
-            include_patterns=include_list,  # Assuming this is valid
-            exclude_patterns=exclude_list,  # Assuming this is valid
-            use_sitemap=use_sitemap,  # Assuming this is valid
-            sitemap_url=sitemap_url_str
-            if use_sitemap and sitemap_url_str and sitemap_url_str.strip()
-            else None,
-        )
-        total_urls_for_progress = 0
-        def crawler_progress_callback(
-            processed_count: int, total_urls: int, current_url_processing: str
-        ):
-            nonlocal total_urls_for_progress
-            total_urls_for_progress = total_urls
-            if total_urls_for_progress > 0:
-                progress(
-                    0.1 + (processed_count / total_urls_for_progress) * 0.4,
-                    desc=f"Crawling: {processed_count}/{total_urls_for_progress} URLs. Current: {current_url_processing}",
-                )
-            else:
-                progress(
-                    0.1 + processed_count * 0.01,
-                    desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url_processing}",
-                )
-        crawler_ui_logger.info(f"Starting crawl for {url}...")
-        progress(0.15, desc=f"Starting crawl for {url}...")
-        crawled_pages = await asyncio.to_thread(
-            crawler.crawl, progress_callback=crawler_progress_callback
         )
-        crawler_ui_logger.info(f"Crawling finished. Found {len(crawled_pages)} pages.")
-        progress(0.5, desc=f"Crawling finished. Found {len(crawled_pages)} pages.")
         if not crawled_pages:
             progress(1.0, desc="No pages were crawled. Check URL and patterns.")
-            # Return structure: (status_message, df_data, raw_cards_data)
             return (
                 "No pages were crawled. Check URL and patterns.",
                 pd.DataFrame().to_dict(orient="records"),
                 [],
             )
-        # --- AGENT SYSTEM INTEGRATION FOR WEB CRAWLING ---
-        crawler_ui_logger.info("🤖 Using agent system for web crawling card generation")
-        # Initialize agent orchestrator
-        orchestrator = AgentOrchestrator(client_manager)
-        await orchestrator.initialize("dummy-key")  # Key already in client_manager
-        # Combine all crawled content into a single context
-        combined_content = "\n\n--- PAGE BREAK ---\n\n".join(
-            [
-                f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."
-                for page in crawled_pages[
-                    :10
-                ]  # Limit to first 10 pages to avoid token limits
-            ]
-        )
-        context = {
-            "source_text": combined_content,
-            "crawl_source": url,
-            "pages_crawled": len(crawled_pages),
-        }
-        progress(0.6, desc="🤖 Processing with agent system...")
-        # Generate cards with agents
-        agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
-            topic=f"Content from {url}",
-            subject="web_content",
-            num_cards=min(len(crawled_pages) * 3, 50),  # 3 cards per page, max 50
-            difficulty="intermediate",
-            enable_quality_pipeline=True,
-            context=context,
         )
         if agent_cards:
-            progress(0.9, desc=f"🤖 Agent system generated {len(agent_cards)} cards")
             cards_for_dataframe_export = generate_cards_from_crawled_content(
                 agent_cards
             )
-            final_message = f"🤖 Agent system processed content from {len(crawled_pages)} pages. Generated {len(agent_cards)} high-quality cards."
             progress(1.0, desc=final_message)
-            return (
-                final_message,
-                cards_for_dataframe_export,
-                agent_cards,
-            )
         else:
-            progress(1.0, desc="🤖 Agent system returned no cards")
-            return (
-                "Agent system returned no cards",
-                pd.DataFrame().to_dict(orient="records"),
-                [],
-            )
     except ConnectionError as e:
         crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
@@ -618,14 +647,6 @@ async def crawl_and_generate(
             [],
         )
-    final_message = f"Content crawled and processed. {len(cards_for_dataframe_export) if cards_for_dataframe_export else 0} potential cards prepared. Load them into the main table for review and export."
-    progress(1.0, desc=final_message)
-    return (
-        final_message,
-        cards_for_dataframe_export,
-        agent_cards,
-    )  # agent_cards is List[Card]
 # --- Card Preview and Editing Utilities (Task 13.3) ---

 import gradio as gr
 import pandas as pd  # Needed for use_selected_subjects type hinting
 from typing import (
+    Callable,
     List,
+    Optional,
     Tuple,
 )
 from urllib.parse import urlparse
 import re  # For URL validation and filename sanitization
 import asyncio
+from ankigen_core.crawler import CrawledPage, WebCrawler
 from ankigen_core.llm_interface import (
     OpenAIClientManager,
 )
     return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
+def _validate_crawl_url(url: str) -> bool:
+    """Return True when ``url`` is an absolute http(s) URL that names a host.
+    Every rejected value triggers a Gradio warning before False is returned,
+    so the user always gets feedback instead of a silent failure.
+    """
+    message = "Invalid URL provided. Please enter a valid http/https URL."
+    if not url or not url.startswith(("http://", "https://")):
+        gr.Warning(message)
+        return False
+    try:
+        # urlparse() accepts almost any string, so a successful parse alone
+        # proves nothing; require a host so inputs like "http://" are rejected.
+        has_host = bool(urlparse(url).hostname)
+    except ValueError:
+        # Raised for malformed authorities, e.g. an unbalanced IPv6 bracket.
+        has_host = False
+    if not has_host:
+        gr.Warning(message)
+    return has_host
+def _create_web_crawler(
+    url: str,
+    max_depth: int,
+    include_patterns: str,
+    exclude_patterns: str,
+    use_sitemap: bool,
+    sitemap_url_str: str,
+) -> WebCrawler:
+    """Build a WebCrawler from the raw crawl settings.
+    ``include_patterns`` and ``exclude_patterns`` are comma-separated strings;
+    each entry is stripped and empty entries are dropped. The sitemap URL is
+    forwarded only when ``use_sitemap`` is set and the URL is not blank.
+    """
+    def _split_patterns(raw: str) -> List[str]:
+        # "a, ,b" -> ["a", "b"]
+        trimmed = (piece.strip() for piece in raw.split(","))
+        return [piece for piece in trimmed if piece]
+    includes = _split_patterns(include_patterns)
+    excludes = _split_patterns(exclude_patterns)
+    if use_sitemap and sitemap_url_str.strip():
+        sitemap_target = sitemap_url_str
+    else:
+        sitemap_target = None
+    return WebCrawler(
+        start_url=url,
+        max_depth=max_depth,
+        include_patterns=includes,
+        exclude_patterns=excludes,
+        use_sitemap=use_sitemap,
+        sitemap_url=sitemap_target,
+    )
+def _create_crawl_progress_callback(
+    progress: gr.Progress,
+) -> Tuple[Callable[[int, int, str], None], List[int]]:
+    """Create the crawler progress callback and its mutable state container.
+    The callback maps crawl progress onto the 0.1-0.5 slice of the progress
+    bar and records the most recently reported total in the returned
+    one-element list. Returns ``(callback, total_urls_container)``.
+    """
+    total_urls_container = [0]  # Mutable container for nonlocal-like behavior
+    def callback(processed_count: int, total_urls: int, current_url: str) -> None:
+        total_urls_container[0] = total_urls
+        if total_urls > 0:
+            fraction = 0.1 + (processed_count / total_urls) * 0.4
+            desc = f"Crawling: {processed_count}/{total_urls} URLs. Current: {current_url}"
+        else:
+            # Total unknown: creep forward by 1% per processed URL.
+            fraction = 0.1 + processed_count * 0.01
+            desc = f"Crawling: {processed_count} URLs discovered. Current: {current_url}"
+        # Clamp to the crawl's share of the bar; without this the unknown-total
+        # estimate passes 0.5 after 40 URLs and 1.0 after 90.
+        progress(min(fraction, 0.5), desc=desc)
+    return callback, total_urls_container
+async def _perform_web_crawl(
+    crawler: WebCrawler,
+    progress: gr.Progress,
+    url: str,
+) -> Optional[List[CrawledPage]]:
+    """Run the crawl in a worker thread while reporting progress.
+    Returns the crawled pages, or None when the crawl produced nothing.
+    """
+    on_progress = _create_crawl_progress_callback(progress)[0]
+    start_message = f"Starting crawl for {url}..."
+    crawler_ui_logger.info(start_message)
+    progress(0.15, desc=start_message)
+    # asyncio.to_thread runs the synchronous crawl call in a separate thread.
+    pages = await asyncio.to_thread(crawler.crawl, progress_callback=on_progress)
+    done_message = f"Crawling finished. Found {len(pages)} pages."
+    crawler_ui_logger.info(done_message)
+    progress(0.5, desc=done_message)
+    return pages or None
+async def _process_crawled_with_agents(
+    crawled_pages: List[CrawledPage],
+    client_manager: OpenAIClientManager,
+    url: str,
+    progress: gr.Progress,
+) -> Tuple[List[Card], str]:
+    """Generate cards from crawled pages through the agent orchestrator.
+    Only the first 10 pages are used as source text, each cut to 2000
+    characters. Returns ``(cards, status_message)``; ``cards`` is an empty
+    list when the agents produce nothing.
+    """
+    crawler_ui_logger.info("Using agent system for web crawling card generation")
+    orchestrator = AgentOrchestrator(client_manager)
+    # API key is already configured in client_manager; "" is just a placeholder.
+    await orchestrator.initialize("")
+    page_sections = []
+    for page in crawled_pages[:10]:
+        page_sections.append(
+            f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."
+        )
+    combined_content = "\n\n--- PAGE BREAK ---\n\n".join(page_sections)
+    page_count = len(crawled_pages)
+    progress(0.6, desc="Processing with agent system...")
+    agent_cards, _ = await orchestrator.generate_cards_with_agents(
+        topic=f"Content from {url}",
+        subject="web_content",
+        # Ask for three cards per crawled page, capped at 50.
+        num_cards=min(page_count * 3, 50),
+        difficulty="intermediate",
+        enable_quality_pipeline=True,
+        context={
+            "source_text": combined_content,
+            "crawl_source": url,
+            "pages_crawled": page_count,
+        },
+    )
+    if not agent_cards:
+        return [], "Agent system returned no cards"
+    card_count = len(agent_cards)
+    progress(0.9, desc=f"Agent system generated {card_count} cards")
+    summary = (
+        f"Agent system processed content from {page_count} pages. "
+        f"Generated {card_count} high-quality cards."
+    )
+    return agent_cards, summary
 async def crawl_and_generate(
     url: str,
     max_depth: int,
     status_textbox: gr.Textbox,
 ) -> Tuple[str, List[dict], List[Card]]:
     """Crawls a website, generates Anki cards, and prepares them for export/display."""
     crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
+    # Reject anything that is not an http(s) URL before starting a crawl.
+    if not _validate_crawl_url(url):
         return "Invalid URL", [], []
     try:
+        # Build the crawler from the raw pattern strings and sitemap options.
+        crawler = _create_web_crawler(
+            url,
+            max_depth,
+            include_patterns,
+            exclude_patterns,
+            use_sitemap,
+            sitemap_url_str,
         )
+        # _perform_web_crawl returns None when the crawl found no pages.
+        crawled_pages = await _perform_web_crawl(crawler, progress, url)
         if not crawled_pages:
             progress(1.0, desc="No pages were crawled. Check URL and patterns.")
             return (
                 "No pages were crawled. Check URL and patterns.",
                 pd.DataFrame().to_dict(orient="records"),
                 [],
             )
+        # The agent step returns the generated cards plus a status message.
+        agent_cards, final_message = await _process_crawled_with_agents(
+            crawled_pages,
+            client_manager,
+            url,
+            progress,
         )
         if agent_cards:
+            # NOTE(review): presumably converts the cards into dataframe rows
+            # for display/export - confirm against the helper.
             cards_for_dataframe_export = generate_cards_from_crawled_content(
                 agent_cards
             )
             progress(1.0, desc=final_message)
+            return final_message, cards_for_dataframe_export, agent_cards
         else:
+            # No cards: still finish the progress bar and return empty results.
+            progress(1.0, desc=final_message)
+            return final_message, pd.DataFrame().to_dict(orient="records"), []
     except ConnectionError as e:
         crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
             [],
         )
 # --- Card Preview and Editing Utilities (Task 13.3) ---