brickfrog committed on
Commit
2ec553e
·
verified ·
1 Parent(s): 06f924e

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -199,3 +199,4 @@ scripts/
199
  .taskmasterconfig
200
  .cursor
201
  .serena/
 
 
199
  .taskmasterconfig
200
  .cursor
201
  .serena/
202
+ .serena/
ankigen_core/agents/base.py CHANGED
@@ -100,30 +100,17 @@ class BaseAgentWrapper:
100
  logger.error(f"Failed to initialize agent {self.config.name}: {e}")
101
  raise
102
 
103
- async def execute(
104
- self, user_input: str, context: Optional[Dict[str, Any]] = None
105
- ) -> tuple[Any, Dict[str, Any]]:
106
- """Execute the agent with user input and optional context"""
107
- if not self.agent:
108
- await self.initialize()
109
-
110
- # Add context to the user input if provided
111
- enhanced_input = user_input
112
- if context is not None:
113
- context_str = "\n".join([f"{k}: {v}" for k, v in context.items()])
114
- enhanced_input = f"{user_input}\n\nContext:\n{context_str}"
115
-
116
- # Execute the agent using Runner.run() with retry logic
117
- if self.agent is None:
118
- raise ValueError("Agent not initialized")
119
-
120
- logger.info(f"🤖 EXECUTING AGENT: {self.config.name}")
121
- logger.info(f"📝 INPUT: {enhanced_input[:200]}...")
122
-
123
- import time
124
-
125
- start_time = time.time()
126
-
127
  for attempt in range(self.config.retry_attempts):
128
  try:
129
  result = await asyncio.wait_for(
@@ -133,63 +120,86 @@ class BaseAgentWrapper:
133
  ),
134
  timeout=self.config.timeout,
135
  )
136
- break
137
  except asyncio.TimeoutError:
138
  if attempt < self.config.retry_attempts - 1:
139
  logger.warning(
140
- f"Agent {self.config.name} timed out (attempt {attempt + 1}/{self.config.retry_attempts}), retrying..."
 
141
  )
142
  continue
143
- else:
144
- logger.error(
145
- f"Agent {self.config.name} timed out after {self.config.retry_attempts} attempts"
146
- )
147
- raise
 
 
 
 
 
 
 
 
 
148
 
149
- try:
150
- execution_time = time.time() - start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  logger.info(
152
- f"Agent {self.config.name} executed successfully in {execution_time:.2f}s"
153
  )
154
 
155
- # Extract usage information from raw_responses
156
- total_usage = {
157
- "input_tokens": 0,
158
- "output_tokens": 0,
159
- "total_tokens": 0,
160
- "requests": 0,
161
- }
162
-
163
- if hasattr(result, "raw_responses") and result.raw_responses:
164
- for response in result.raw_responses:
165
- if hasattr(response, "usage") and response.usage:
166
- total_usage["input_tokens"] += response.usage.input_tokens
167
- total_usage["output_tokens"] += response.usage.output_tokens
168
- total_usage["total_tokens"] += response.usage.total_tokens
169
- total_usage["requests"] += response.usage.requests
170
-
171
- # Track usage with the token tracker
172
- track_usage_from_agents_sdk(total_usage, self.config.model)
173
- logger.info(f"💰 AGENT USAGE: {total_usage}")
174
-
175
- # Extract the final output from the result
176
- if hasattr(result, "new_items") and result.new_items:
177
- # Get the last message content
178
- from agents.items import ItemHelpers
179
-
180
- text_output = ItemHelpers.text_message_outputs(result.new_items)
181
-
182
- # If we have structured output, the response should already be parsed
183
- if self.config.output_type and self.config.output_type is not str:
184
- logger.info(
185
- f"✅ STRUCTURED OUTPUT: {type(text_output)} -> {self.config.output_type}"
186
- )
187
- # The agents SDK should return the structured object directly
188
- return text_output, total_usage
189
- else:
190
- return text_output, total_usage
191
- else:
192
- return str(result), total_usage
193
 
194
  except asyncio.TimeoutError:
195
  logger.error(
 
100
  logger.error(f"Failed to initialize agent {self.config.name}: {e}")
101
  raise
102
 
103
+ def _enhance_input_with_context(
104
+ self, user_input: str, context: Optional[Dict[str, Any]]
105
+ ) -> str:
106
+ """Add context to user input if provided."""
107
+ if context is None:
108
+ return user_input
109
+ context_str = "\n".join([f"{k}: {v}" for k, v in context.items()])
110
+ return f"{user_input}\n\nContext:\n{context_str}"
111
+
112
+ async def _execute_with_retry(self, enhanced_input: str) -> Any:
113
+ """Execute agent with retry logic on timeout."""
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  for attempt in range(self.config.retry_attempts):
115
  try:
116
  result = await asyncio.wait_for(
 
120
  ),
121
  timeout=self.config.timeout,
122
  )
123
+ return result
124
  except asyncio.TimeoutError:
125
  if attempt < self.config.retry_attempts - 1:
126
  logger.warning(
127
+ f"Agent {self.config.name} timed out "
128
+ f"(attempt {attempt + 1}/{self.config.retry_attempts}), retrying..."
129
  )
130
  continue
131
+ logger.error(
132
+ f"Agent {self.config.name} timed out after {self.config.retry_attempts} attempts"
133
+ )
134
+ raise
135
+ raise RuntimeError("Retry loop exited without result")
136
+
137
+ def _extract_and_track_usage(self, result: Any) -> Dict[str, Any]:
138
+ """Extract usage info from result and track it."""
139
+ total_usage = {
140
+ "input_tokens": 0,
141
+ "output_tokens": 0,
142
+ "total_tokens": 0,
143
+ "requests": 0,
144
+ }
145
 
146
+ if hasattr(result, "raw_responses") and result.raw_responses:
147
+ for response in result.raw_responses:
148
+ if hasattr(response, "usage") and response.usage:
149
+ total_usage["input_tokens"] += response.usage.input_tokens
150
+ total_usage["output_tokens"] += response.usage.output_tokens
151
+ total_usage["total_tokens"] += response.usage.total_tokens
152
+ total_usage["requests"] += response.usage.requests
153
+
154
+ track_usage_from_agents_sdk(total_usage, self.config.model)
155
+ logger.info(f"Agent usage: {total_usage}")
156
+
157
+ return total_usage
158
+
159
+ def _extract_output(self, result: Any) -> Any:
160
+ """Extract final output from agent result."""
161
+ if not (hasattr(result, "new_items") and result.new_items):
162
+ return str(result)
163
+
164
+ from agents.items import ItemHelpers
165
+
166
+ text_output = ItemHelpers.text_message_outputs(result.new_items)
167
+
168
+ if self.config.output_type and self.config.output_type is not str:
169
  logger.info(
170
+ f"Structured output: {type(text_output)} -> {self.config.output_type}"
171
  )
172
 
173
+ return text_output
174
+
175
+ async def execute(
176
+ self, user_input: str, context: Optional[Dict[str, Any]] = None
177
+ ) -> tuple[Any, Dict[str, Any]]:
178
+ """Execute the agent with user input and optional context."""
179
+ if not self.agent:
180
+ await self.initialize()
181
+
182
+ if self.agent is None:
183
+ raise ValueError("Agent not initialized")
184
+
185
+ enhanced_input = self._enhance_input_with_context(user_input, context)
186
+
187
+ logger.info(f"Executing agent: {self.config.name}")
188
+ logger.info(f"Input: {enhanced_input[:200]}...")
189
+
190
+ import time
191
+
192
+ start_time = time.time()
193
+
194
+ try:
195
+ result = await self._execute_with_retry(enhanced_input)
196
+ execution_time = time.time() - start_time
197
+ logger.info(f"Agent {self.config.name} executed in {execution_time:.2f}s")
198
+
199
+ total_usage = self._extract_and_track_usage(result)
200
+ output = self._extract_output(result)
201
+
202
+ return output, total_usage
 
 
 
 
 
 
 
 
203
 
204
  except asyncio.TimeoutError:
205
  logger.error(
ankigen_core/agents/generators.py CHANGED
@@ -67,10 +67,8 @@ class SubjectExpertAgent(BaseAgentWrapper):
67
  "subject_expert configuration not found - agent system not properly initialized"
68
  )
69
 
70
- # Enable structured output for card generation
71
  base_config.output_type = CardsGenerationSchema
72
 
73
- # Customize instructions for the specific subject
74
  if subject != "general" and base_config.custom_prompts:
75
  subject_prompt = base_config.custom_prompts.get(subject.lower(), "")
76
  if subject_prompt:
@@ -81,102 +79,114 @@ class SubjectExpertAgent(BaseAgentWrapper):
81
  super().__init__(base_config, openai_client)
82
  self.subject = subject
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  async def generate_cards(
85
  self, topic: str, num_cards: int = 5, context: Optional[Dict[str, Any]] = None
86
  ) -> List[Card]:
87
- """Generate flashcards for a given topic with automatic batching for large requests"""
88
- try:
89
- # Use batching for large numbers of cards to avoid LLM limitations
90
- batch_size = 10 # Generate max 10 cards per batch
91
- all_cards = []
92
- total_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
93
-
94
- cards_remaining = num_cards
95
- batch_num = 1
96
 
97
- logger.info(
98
- f"Generating {num_cards} cards for topic '{topic}' using {((num_cards - 1) // batch_size) + 1} batches"
99
- )
100
 
101
- # Track card topics from previous batches to avoid duplication
102
- previous_card_topics = []
 
103
 
 
104
  while cards_remaining > 0:
105
- cards_in_this_batch = min(batch_size, cards_remaining)
 
106
 
107
- logger.info(
108
- f"Generating batch {batch_num}: {cards_in_this_batch} cards"
109
- )
110
-
111
- # Initialize agent only once - Runner.run() creates fresh context each time
112
- # No conversation history accumulation across batches (significant performance gain)
113
  if not self.agent:
114
  await self.initialize()
115
 
116
- user_input = (
117
- f"Generate {cards_in_this_batch} flashcards for the topic: {topic}"
118
  )
119
-
120
- # Add cloze generation instruction if enabled
121
- if context and context.get("generate_cloze"):
122
- user_input += "\n\nIMPORTANT: Generate a mix of card types including cloze cards. For code examples, syntax, and fill-in-the-blank concepts, use cloze cards (card_type='cloze'). Aim for roughly 50% cloze cards when dealing with technical/programming content."
123
-
124
- if context:
125
- user_input += f"\n\nAdditional context: {context}"
126
-
127
- # Add previous topics to avoid repetition instead of full conversation history
128
- if previous_card_topics:
129
- topics_summary = ", ".join(
130
- previous_card_topics[-20:]
131
- ) # Last 20 topics to keep it manageable
132
- user_input += f"\n\nAvoid creating cards about these already covered topics: {topics_summary}"
133
-
134
- if batch_num > 1:
135
- user_input += f"\n\nThis is batch {batch_num} of cards. Ensure these cards cover different aspects of the topic."
136
-
137
  response, usage = await self.execute(user_input, context)
138
 
139
- # Accumulate usage information
140
- if usage:
141
- for key in total_usage:
142
- total_usage[key] += usage.get(key, 0)
143
-
144
  batch_cards = self._parse_cards_response(response, topic)
145
  all_cards.extend(batch_cards)
146
 
147
- # Extract topics from generated cards to avoid duplication in next batch
148
- for card in batch_cards:
149
- if hasattr(card, "front") and card.front and card.front.question:
150
- # Extract key terms from the question for deduplication
151
- question_words = card.front.question.lower().split()
152
- key_terms = [word for word in question_words if len(word) > 3][
153
- :3
154
- ] # First 3 meaningful words
155
- if key_terms:
156
- previous_card_topics.append(" ".join(key_terms))
157
-
158
  cards_remaining -= len(batch_cards)
159
- batch_num += 1
160
 
161
  logger.info(
162
- f"Batch {batch_num - 1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
163
  )
164
 
165
- # Safety check to prevent infinite loops
166
  if len(batch_cards) == 0:
167
- logger.warning(
168
- f"No cards generated in batch {batch_num - 1}, stopping generation"
169
- )
170
  break
171
 
172
- # Log final usage information
 
173
  if total_usage.get("total_tokens", 0) > 0:
174
  logger.info(
175
- f"💰 Total Token Usage: {total_usage['total_tokens']} tokens (Input: {total_usage['input_tokens']}, Output: {total_usage['output_tokens']})"
 
176
  )
177
 
178
  logger.info(
179
- f"Generated {len(all_cards)} cards total across {batch_num - 1} batches for topic '{topic}'"
180
  )
181
  return all_cards
182
 
 
67
  "subject_expert configuration not found - agent system not properly initialized"
68
  )
69
 
 
70
  base_config.output_type = CardsGenerationSchema
71
 
 
72
  if subject != "general" and base_config.custom_prompts:
73
  subject_prompt = base_config.custom_prompts.get(subject.lower(), "")
74
  if subject_prompt:
 
79
  super().__init__(base_config, openai_client)
80
  self.subject = subject
81
 
82
+ def _build_batch_prompt(
83
+ self,
84
+ topic: str,
85
+ cards_in_batch: int,
86
+ batch_num: int,
87
+ context: Optional[Dict[str, Any]],
88
+ previous_topics: List[str],
89
+ ) -> str:
90
+ """Build user input prompt for a batch of cards."""
91
+ user_input = f"Generate {cards_in_batch} flashcards for the topic: {topic}"
92
+
93
+ if context and context.get("generate_cloze"):
94
+ user_input += (
95
+ "\n\nIMPORTANT: Generate a mix of card types including cloze cards. "
96
+ "For code examples, syntax, and fill-in-the-blank concepts, use cloze cards "
97
+ "(card_type='cloze'). Aim for roughly 50% cloze cards when dealing with technical/programming content."
98
+ )
99
+
100
+ if context:
101
+ user_input += f"\n\nAdditional context: {context}"
102
+
103
+ if previous_topics:
104
+ topics_summary = ", ".join(previous_topics[-20:])
105
+ user_input += f"\n\nAvoid creating cards about these already covered topics: {topics_summary}"
106
+
107
+ if batch_num > 1:
108
+ user_input += f"\n\nThis is batch {batch_num} of cards. Ensure these cards cover different aspects of the topic."
109
+
110
+ return user_input
111
+
112
+ def _extract_topics_for_dedup(self, batch_cards: List[Card]) -> List[str]:
113
+ """Extract key terms from card questions for deduplication."""
114
+ topics = []
115
+ for card in batch_cards:
116
+ if hasattr(card, "front") and card.front and card.front.question:
117
+ question_words = card.front.question.lower().split()
118
+ key_terms = [word for word in question_words if len(word) > 3][:3]
119
+ if key_terms:
120
+ topics.append(" ".join(key_terms))
121
+ return topics
122
+
123
+ def _accumulate_usage(
124
+ self, total_usage: Dict[str, int], batch_usage: Optional[Dict[str, Any]]
125
+ ) -> None:
126
+ """Accumulate batch usage into total usage."""
127
+ if batch_usage:
128
+ for key in total_usage:
129
+ total_usage[key] += batch_usage.get(key, 0)
130
+
131
  async def generate_cards(
132
  self, topic: str, num_cards: int = 5, context: Optional[Dict[str, Any]] = None
133
  ) -> List[Card]:
134
+ """Generate flashcards for a given topic with automatic batching."""
135
+ batch_size = 10
136
+ all_cards: List[Card] = []
137
+ total_usage: Dict[str, int] = {
138
+ "total_tokens": 0,
139
+ "input_tokens": 0,
140
+ "output_tokens": 0,
141
+ }
142
+ previous_topics: List[str] = []
143
 
144
+ cards_remaining = num_cards
145
+ batch_num = 1
146
+ num_batches = ((num_cards - 1) // batch_size) + 1
147
 
148
+ logger.info(
149
+ f"Generating {num_cards} cards for '{topic}' using {num_batches} batches"
150
+ )
151
 
152
+ try:
153
  while cards_remaining > 0:
154
+ cards_in_batch = min(batch_size, cards_remaining)
155
+ logger.info(f"Generating batch {batch_num}: {cards_in_batch} cards")
156
 
 
 
 
 
 
 
157
  if not self.agent:
158
  await self.initialize()
159
 
160
+ user_input = self._build_batch_prompt(
161
+ topic, cards_in_batch, batch_num, context, previous_topics
162
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  response, usage = await self.execute(user_input, context)
164
 
165
+ self._accumulate_usage(total_usage, usage)
 
 
 
 
166
  batch_cards = self._parse_cards_response(response, topic)
167
  all_cards.extend(batch_cards)
168
 
169
+ previous_topics.extend(self._extract_topics_for_dedup(batch_cards))
 
 
 
 
 
 
 
 
 
 
170
  cards_remaining -= len(batch_cards)
 
171
 
172
  logger.info(
173
+ f"Batch {batch_num} generated {len(batch_cards)} cards. {cards_remaining} remaining."
174
  )
175
 
 
176
  if len(batch_cards) == 0:
177
+ logger.warning(f"No cards generated in batch {batch_num}, stopping")
 
 
178
  break
179
 
180
+ batch_num += 1
181
+
182
  if total_usage.get("total_tokens", 0) > 0:
183
  logger.info(
184
+ f"Total usage: {total_usage['total_tokens']} tokens "
185
+ f"(Input: {total_usage['input_tokens']}, Output: {total_usage['output_tokens']})"
186
  )
187
 
188
  logger.info(
189
+ f"Generated {len(all_cards)} cards across {batch_num} batches for '{topic}'"
190
  )
191
  return all_cards
192
 
ankigen_core/agents/token_tracker.py CHANGED
@@ -34,6 +34,25 @@ class TokenTracker:
34
  def count_tokens_for_messages(
35
  self, messages: List[Dict[str, str]], model: str
36
  ) -> int:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  try:
38
  encoding = tiktoken.encoding_for_model(model)
39
  except KeyError:
@@ -61,11 +80,6 @@ class TokenTracker:
61
 
62
  return len(encoding.encode(text))
63
 
64
- def estimate_cost(
65
- self, prompt_tokens: int, completion_tokens: int, model: str
66
- ) -> Optional[float]:
67
- return None
68
-
69
  def track_usage_from_response(
70
  self, response_data, model: str
71
  ) -> Optional[TokenUsage]:
@@ -98,10 +112,7 @@ class TokenTracker:
98
  ) -> TokenUsage:
99
  total_tokens = prompt_tokens + completion_tokens
100
 
101
- if actual_cost is not None:
102
- final_cost = actual_cost
103
- else:
104
- final_cost = self.estimate_cost(prompt_tokens, completion_tokens, model)
105
 
106
  usage = TokenUsage(
107
  prompt_tokens=prompt_tokens,
 
34
  def count_tokens_for_messages(
35
  self, messages: List[Dict[str, str]], model: str
36
  ) -> int:
37
+ """
38
+ Count total tokens for a list of chat messages using tiktoken.
39
+
40
+ Implements OpenAI's token counting algorithm for chat completions:
41
+ - Each message adds 3 tokens for role/content/structure overhead
42
+ - Message names add an additional token
43
+ - The entire message list adds 3 tokens for conversation wrapper
44
+
45
+ The encoding is selected based on the model:
46
+ - Attempts to use model-specific encoding via tiktoken
47
+ - Falls back to 'o200k_base' (GPT-4o encoding) for unknown models
48
+
49
+ Args:
50
+ messages: List of message dicts (each with 'role', 'content', optional 'name')
51
+ model: OpenAI model identifier (e.g., 'gpt-4.1', 'gpt-4o')
52
+
53
+ Returns:
54
+ Total tokens required to send these messages to the model
55
+ """
56
  try:
57
  encoding = tiktoken.encoding_for_model(model)
58
  except KeyError:
 
80
 
81
  return len(encoding.encode(text))
82
 
 
 
 
 
 
83
  def track_usage_from_response(
84
  self, response_data, model: str
85
  ) -> Optional[TokenUsage]:
 
112
  ) -> TokenUsage:
113
  total_tokens = prompt_tokens + completion_tokens
114
 
115
+ final_cost = actual_cost # Cost estimation removed - rely on API-provided costs
 
 
 
116
 
117
  usage = TokenUsage(
118
  prompt_tokens=prompt_tokens,
ankigen_core/card_generator.py CHANGED
@@ -70,10 +70,58 @@ GENERATION_MODES = [
70
  # Legacy functions removed - all card generation now handled by agent system
71
 
72
 
73
- async def orchestrate_card_generation( # MODIFIED: Added async
74
- client_manager: OpenAIClientManager, # Expect the manager
75
- cache: ResponseCache, # Expect the cache instance
76
- # --- UI Inputs --- (These will be passed from app.py handler)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  api_key_input: str,
78
  subject: str,
79
  generation_mode: str,
@@ -89,109 +137,66 @@ async def orchestrate_card_generation( # MODIFIED: Added async
89
  library_topic: str = None,
90
  ):
91
  """Orchestrates the card generation process based on UI inputs."""
92
-
93
  logger.info(f"Starting card generation orchestration in {generation_mode} mode")
94
  logger.debug(
95
- f"Parameters: mode={generation_mode}, topics={topic_number}, cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
 
96
  )
97
 
98
- # --- AGENT SYSTEM INTEGRATION ---
99
- if AGENTS_AVAILABLE:
100
- logger.info("🤖 Using agent system for card generation")
101
- try:
102
- from ankigen_core.agents.token_tracker import get_token_tracker
103
-
104
- token_tracker = get_token_tracker()
105
-
106
- orchestrator = AgentOrchestrator(client_manager)
107
-
108
- logger.info(f"Using {model_name} for SubjectExpertAgent")
109
- await orchestrator.initialize(api_key_input, {"subject_expert": model_name})
110
-
111
- # Map generation mode to subject
112
- agent_subject = "general"
113
- if generation_mode == "subject":
114
- agent_subject = subject if subject else "general"
115
- elif generation_mode == "path":
116
- agent_subject = "curriculum_design"
117
- elif generation_mode == "text":
118
- agent_subject = "content_analysis"
119
-
120
- total_cards_needed = topic_number * cards_per_topic
121
-
122
- context = {}
123
- if generation_mode == "text" and source_text:
124
- context["source_text"] = source_text
125
-
126
- agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
127
- topic=subject if subject else "Mixed Topics",
128
- subject=agent_subject,
129
- num_cards=total_cards_needed,
130
- difficulty="intermediate",
131
- context=context,
132
- library_name=library_name,
133
- library_topic=library_topic,
134
- generate_cloze=generate_cloze,
135
- )
136
 
137
- # Get token usage from session
138
- try:
139
- # Try both method names for compatibility
140
- if hasattr(token_tracker, "get_session_summary"):
141
- token_usage = token_tracker.get_session_summary()
142
- elif hasattr(token_tracker, "get_session_usage"):
143
- token_usage = token_tracker.get_session_usage()
144
- else:
145
- raise AttributeError("TokenTracker has no session summary method")
146
-
147
- token_usage_html = f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
148
- except Exception as e:
149
- logger.error(f"Token usage collection failed: {e}")
150
- token_usage_html = "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
151
-
152
- # Convert agent cards to dataframe format
153
- if agent_cards:
154
- formatted_cards = format_cards_for_dataframe(
155
- agent_cards,
156
- topic_name=subject if subject else "General",
157
- start_index=1,
158
- )
159
-
160
- output_df = pd.DataFrame(
161
- formatted_cards, columns=get_dataframe_columns()
162
- )
163
- total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
164
-
165
- logger.info(
166
- f"Agent system generated {len(output_df)} cards successfully"
167
- )
168
- return output_df, total_cards_message, token_usage_html
169
- else:
170
- logger.error("Agent system returned no cards")
171
- gr.Error("🤖 Agent system returned no cards")
172
- return (
173
- pd.DataFrame(columns=get_dataframe_columns()),
174
- "Agent system returned no cards.",
175
- "",
176
- )
177
-
178
- except Exception as e:
179
- logger.error(f"Agent system failed: {e}")
180
- gr.Error(f"🤖 Agent system error: {str(e)}")
181
- return (
182
- pd.DataFrame(columns=get_dataframe_columns()),
183
- f"Agent system error: {str(e)}",
184
- "",
185
  )
 
 
 
 
 
 
 
 
 
 
186
 
187
- # Agent system is required and should never fail to be available
188
- logger.error("Agent system failed but is required - this should not happen")
189
- gr.Error("Agent system is required but not available")
190
- return (
191
- pd.DataFrame(columns=get_dataframe_columns()),
192
- "Agent system error",
193
- "",
194
- )
195
 
196
 
197
  # Legacy helper functions removed - all processing now handled by agent system
 
70
  # Legacy functions removed - all card generation now handled by agent system
71
 
72
 
73
+ def _map_generation_mode_to_subject(generation_mode: str, subject: str) -> str:
74
+ """Map UI generation mode to agent subject."""
75
+ if generation_mode == "subject":
76
+ return subject if subject else "general"
77
+ elif generation_mode == "path":
78
+ return "curriculum_design"
79
+ elif generation_mode == "text":
80
+ return "content_analysis"
81
+ return "general"
82
+
83
+
84
+ def _build_generation_context(generation_mode: str, source_text: str) -> Dict[str, Any]:
85
+ """Build context dict for card generation."""
86
+ context: Dict[str, Any] = {}
87
+ if generation_mode == "text" and source_text:
88
+ context["source_text"] = source_text
89
+ return context
90
+
91
+
92
+ def _get_token_usage_html(token_tracker) -> str:
93
+ """Extract token usage and format as HTML."""
94
+ try:
95
+ if hasattr(token_tracker, "get_session_summary"):
96
+ token_usage = token_tracker.get_session_summary()
97
+ elif hasattr(token_tracker, "get_session_usage"):
98
+ token_usage = token_tracker.get_session_usage()
99
+ else:
100
+ raise AttributeError("TokenTracker has no session summary method")
101
+
102
+ return f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
103
+ except Exception as e:
104
+ logger.error(f"Token usage collection failed: {e}")
105
+ return "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
106
+
107
+
108
+ def _format_cards_to_dataframe(
109
+ agent_cards: List[Card], subject: str
110
+ ) -> tuple[pd.DataFrame, str]:
111
+ """Format agent cards to DataFrame and generate message."""
112
+ formatted_cards = format_cards_for_dataframe(
113
+ agent_cards,
114
+ topic_name=subject if subject else "General",
115
+ start_index=1,
116
+ )
117
+ output_df = pd.DataFrame(formatted_cards, columns=get_dataframe_columns())
118
+ total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
119
+ return output_df, total_cards_message
120
+
121
+
122
+ async def orchestrate_card_generation(
123
+ client_manager: OpenAIClientManager,
124
+ cache: ResponseCache,
125
  api_key_input: str,
126
  subject: str,
127
  generation_mode: str,
 
137
  library_topic: str = None,
138
  ):
139
  """Orchestrates the card generation process based on UI inputs."""
 
140
  logger.info(f"Starting card generation orchestration in {generation_mode} mode")
141
  logger.debug(
142
+ f"Parameters: mode={generation_mode}, topics={topic_number}, "
143
+ f"cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
144
  )
145
 
146
+ if not AGENTS_AVAILABLE:
147
+ logger.error("Agent system is required but not available")
148
+ gr.Error("Agent system is required but not available")
149
+ return pd.DataFrame(columns=get_dataframe_columns()), "Agent system error", ""
150
+
151
+ try:
152
+ from ankigen_core.agents.token_tracker import get_token_tracker
153
+
154
+ token_tracker = get_token_tracker()
155
+ orchestrator = AgentOrchestrator(client_manager)
156
+
157
+ logger.info(f"Using {model_name} for SubjectExpertAgent")
158
+ await orchestrator.initialize(api_key_input, {"subject_expert": model_name})
159
+
160
+ agent_subject = _map_generation_mode_to_subject(generation_mode, subject)
161
+ context = _build_generation_context(generation_mode, source_text)
162
+ total_cards_needed = topic_number * cards_per_topic
163
+
164
+ agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
165
+ topic=subject if subject else "Mixed Topics",
166
+ subject=agent_subject,
167
+ num_cards=total_cards_needed,
168
+ difficulty="intermediate",
169
+ context=context,
170
+ library_name=library_name,
171
+ library_topic=library_topic,
172
+ generate_cloze=generate_cloze,
173
+ )
174
+
175
+ token_usage_html = _get_token_usage_html(token_tracker)
 
 
 
 
 
 
 
 
176
 
177
+ if agent_cards:
178
+ output_df, total_cards_message = _format_cards_to_dataframe(
179
+ agent_cards, subject
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  )
181
+ logger.info(f"Agent system generated {len(output_df)} cards successfully")
182
+ return output_df, total_cards_message, token_usage_html
183
+
184
+ logger.error("Agent system returned no cards")
185
+ gr.Error("Agent system returned no cards")
186
+ return (
187
+ pd.DataFrame(columns=get_dataframe_columns()),
188
+ "Agent system returned no cards.",
189
+ "",
190
+ )
191
 
192
+ except Exception as e:
193
+ logger.error(f"Agent system failed: {e}")
194
+ gr.Error(f"Agent system error: {str(e)}")
195
+ return (
196
+ pd.DataFrame(columns=get_dataframe_columns()),
197
+ f"Agent system error: {str(e)}",
198
+ "",
199
+ )
200
 
201
 
202
  # Legacy helper functions removed - all processing now handled by agent system
ankigen_core/context7.py CHANGED
@@ -123,6 +123,129 @@ class Context7Client:
123
  logger.error(f"Error calling Context7 tool {tool_name}: {e}")
124
  return {"error": str(e), "success": False}
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  async def resolve_library_id(self, library_name: str) -> Optional[str]:
127
  """Resolve a library name to a Context7-compatible ID"""
128
  logger.info(f"Resolving library ID for: {library_name}")
@@ -131,115 +254,19 @@ class Context7Client:
131
  "resolve-library-id", {"libraryName": library_name}
132
  )
133
 
134
- if result and result.get("success") and result.get("text"):
135
- text = result["text"]
136
-
137
- # Parse the structured response format
138
- libraries = []
139
- lines = text.split("\n")
140
-
141
- current_lib = {}
142
- for line in lines:
143
- line = line.strip()
144
-
145
- # Parse title
146
- if line.startswith("- Title:"):
147
- if current_lib and current_lib.get("id"):
148
- libraries.append(current_lib)
149
- current_lib = {
150
- "title": line.replace("- Title:", "").strip().lower()
151
- }
152
-
153
- # Parse library ID
154
- elif line.startswith("- Context7-compatible library ID:"):
155
- lib_id = line.replace(
156
- "- Context7-compatible library ID:", ""
157
- ).strip()
158
- if current_lib is not None:
159
- current_lib["id"] = lib_id
160
-
161
- # Parse code snippets count
162
- elif line.startswith("- Code Snippets:"):
163
- snippets_str = line.replace("- Code Snippets:", "").strip()
164
- try:
165
- snippets = int(snippets_str)
166
- if current_lib is not None:
167
- current_lib["snippets"] = snippets
168
- except ValueError:
169
- pass
170
-
171
- # Parse trust score
172
- elif line.startswith("- Trust Score:"):
173
- score_str = line.replace("- Trust Score:", "").strip()
174
- try:
175
- trust = float(score_str)
176
- if current_lib is not None:
177
- current_lib["trust"] = trust
178
- except ValueError:
179
- pass
180
-
181
- # Add the last library if exists
182
- if current_lib and current_lib.get("id"):
183
- libraries.append(current_lib)
184
-
185
- # If we found libraries, pick the best match
186
- if libraries:
187
- search_term = library_name.lower()
188
-
189
- # Score each library
190
- best_lib = None
191
- best_score = -1
192
-
193
- for lib in libraries:
194
- score = 0
195
- lib_title = lib.get("title", "")
196
- lib_id = lib["id"].lower()
197
-
198
- # Exact title match gets highest priority
199
- if lib_title == search_term:
200
- score += 10000
201
- # Check if it's exactly "pandas" in the path (not geopandas, etc)
202
- elif lib_id == f"/{search_term}-dev/{search_term}":
203
- score += 5000
204
- elif f"/{search_term}/" in lib_id or lib_id.endswith(
205
- f"/{search_term}"
206
- ):
207
- score += 2000
208
- # Partial title match (but penalize if it's a compound like "geopandas")
209
- elif search_term in lib_title:
210
- if lib_title == search_term:
211
- score += 1000
212
- elif lib_title.startswith(search_term):
213
- score += 200
214
- else:
215
- score += 50
216
-
217
- # Strong bonus for code snippets (indicates main library)
218
- snippets = lib.get("snippets", 0)
219
- score += snippets / 10 # Pandas has 7386 snippets
220
-
221
- # Significant bonus for trust score (high trust = official/authoritative)
222
- trust = lib.get("trust", 0)
223
- score += trust * 100 # Trust 9.2 = 920 points, Trust 7 = 700 points
224
-
225
- # Debug logging
226
- if search_term in lib_title or search_term in lib_id:
227
- logger.debug(
228
- f"Scoring {lib['id']}: title='{lib_title}', snippets={snippets}, "
229
- f"trust={trust}, score={score:.2f}"
230
- )
231
-
232
- if score > best_score:
233
- best_score = score
234
- best_lib = lib
235
-
236
- if best_lib:
237
- logger.info(
238
- f"Resolved '{library_name}' to ID: {best_lib['id']} "
239
- f"(title: {best_lib.get('title', 'unknown')}, snippets: {best_lib.get('snippets', 0)}, "
240
- f"trust: {best_lib.get('trust', 0)}, score: {best_score:.2f})"
241
- )
242
- return best_lib["id"]
243
 
244
  logger.warning(f"Could not resolve library ID for '{library_name}'")
245
  return None
 
123
  logger.error(f"Error calling Context7 tool {tool_name}: {e}")
124
  return {"error": str(e), "success": False}
125
 
126
+ def _parse_library_response(self, text: str) -> list[Dict[str, Any]]:
127
+ """Parse Context7 response text into list of library dicts.
128
+
129
+ Args:
130
+ text: Raw text response from Context7
131
+
132
+ Returns:
133
+ List of library dicts with keys: title, id, snippets, trust
134
+ """
135
+ libraries = []
136
+ lines = text.split("\n")
137
+ current_lib: Dict[str, Any] = {}
138
+
139
+ for line in lines:
140
+ line = line.strip()
141
+
142
+ if line.startswith("- Title:"):
143
+ if current_lib and current_lib.get("id"):
144
+ libraries.append(current_lib)
145
+ current_lib = {"title": line.replace("- Title:", "").strip().lower()}
146
+
147
+ elif line.startswith("- Context7-compatible library ID:"):
148
+ lib_id = line.replace("- Context7-compatible library ID:", "").strip()
149
+ if current_lib is not None:
150
+ current_lib["id"] = lib_id
151
+
152
+ elif line.startswith("- Code Snippets:"):
153
+ snippets_str = line.replace("- Code Snippets:", "").strip()
154
+ try:
155
+ if current_lib is not None:
156
+ current_lib["snippets"] = int(snippets_str)
157
+ except ValueError:
158
+ pass
159
+
160
+ elif line.startswith("- Trust Score:"):
161
+ score_str = line.replace("- Trust Score:", "").strip()
162
+ try:
163
+ if current_lib is not None:
164
+ current_lib["trust"] = float(score_str)
165
+ except ValueError:
166
+ pass
167
+
168
+ if current_lib and current_lib.get("id"):
169
+ libraries.append(current_lib)
170
+
171
+ return libraries
172
+
173
+ def _score_library(self, lib: Dict[str, Any], search_term: str) -> float:
174
+ """Score a library based on how well it matches the search term.
175
+
176
+ Args:
177
+ lib: Library dict with title, id, snippets, trust
178
+ search_term: Lowercase search term
179
+
180
+ Returns:
181
+ Score (higher is better match)
182
+ """
183
+ score = 0.0
184
+ lib_title = lib.get("title", "")
185
+ lib_id = lib["id"].lower()
186
+
187
+ # Exact title match gets highest priority
188
+ if lib_title == search_term:
189
+ score += 10000
190
+ elif lib_id == f"/{search_term}-dev/{search_term}":
191
+ score += 5000
192
+ elif f"/{search_term}/" in lib_id or lib_id.endswith(f"/{search_term}"):
193
+ score += 2000
194
+ elif search_term in lib_title:
195
+ if lib_title == search_term:
196
+ score += 1000
197
+ elif lib_title.startswith(search_term):
198
+ score += 200
199
+ else:
200
+ score += 50
201
+
202
+ # Bonus for code snippets (indicates main library)
203
+ snippets = lib.get("snippets", 0)
204
+ score += snippets / 10
205
+
206
+ # Bonus for trust score (high trust = official/authoritative)
207
+ trust = lib.get("trust", 0)
208
+ score += trust * 100
209
+
210
+ return score
211
+
212
+ def _select_best_library(
213
+ self, libraries: list[Dict[str, Any]], search_term: str
214
+ ) -> Optional[Dict[str, Any]]:
215
+ """Select the best matching library from a list.
216
+
217
+ Args:
218
+ libraries: List of library dicts
219
+ search_term: Lowercase search term
220
+
221
+ Returns:
222
+ Best matching library dict, or None if no match
223
+ """
224
+ best_lib = None
225
+ best_score = -1.0
226
+
227
+ for lib in libraries:
228
+ score = self._score_library(lib, search_term)
229
+
230
+ if search_term in lib.get("title", "") or search_term in lib["id"].lower():
231
+ logger.debug(
232
+ f"Scoring {lib['id']}: title='{lib.get('title', '')}', "
233
+ f"snippets={lib.get('snippets', 0)}, trust={lib.get('trust', 0)}, score={score:.2f}"
234
+ )
235
+
236
+ if score > best_score:
237
+ best_score = score
238
+ best_lib = lib
239
+
240
+ if best_lib:
241
+ logger.info(
242
+ f"Selected library: {best_lib['id']} (title: {best_lib.get('title', 'unknown')}, "
243
+ f"snippets: {best_lib.get('snippets', 0)}, trust: {best_lib.get('trust', 0)}, "
244
+ f"score: {best_score:.2f})"
245
+ )
246
+
247
+ return best_lib
248
+
249
  async def resolve_library_id(self, library_name: str) -> Optional[str]:
250
  """Resolve a library name to a Context7-compatible ID"""
251
  logger.info(f"Resolving library ID for: {library_name}")
 
254
  "resolve-library-id", {"libraryName": library_name}
255
  )
256
 
257
+ if not (result and result.get("success") and result.get("text")):
258
+ logger.warning(f"Could not resolve library ID for '{library_name}'")
259
+ return None
260
+
261
+ libraries = self._parse_library_response(result["text"])
262
+ if not libraries:
263
+ logger.warning(f"Could not resolve library ID for '{library_name}'")
264
+ return None
265
+
266
+ best_lib = self._select_best_library(libraries, library_name.lower())
267
+ if best_lib:
268
+ logger.info(f"Resolved '{library_name}' to ID: {best_lib['id']}")
269
+ return best_lib["id"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  logger.warning(f"Could not resolve library ID for '{library_name}'")
272
  return None
ankigen_core/crawler.py CHANGED
@@ -418,119 +418,173 @@ class WebCrawler:
418
 
419
  return False, None
420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  def crawl(
422
  self, progress_callback: Optional[Callable[[int, int, str], None]] = None
423
  ) -> List[CrawledPage]:
424
- # Initialize URLs using helper method
 
 
 
 
 
 
 
425
  urls_to_visit = self._initialize_crawl_queue()
426
  crawled_pages: List[CrawledPage] = []
427
- initial_total_for_progress = len(urls_to_visit)
428
-
429
  processed_count = 0
 
430
  while urls_to_visit:
431
  current_url, current_depth, current_parent_url = urls_to_visit.pop(0)
432
 
433
- current_total_for_progress = (
434
- initial_total_for_progress
435
- if self.use_sitemap
436
- else processed_count + len(urls_to_visit) + 1
 
 
437
  )
438
 
439
- if progress_callback:
440
- progress_callback(
441
- processed_count,
442
- current_total_for_progress,
443
- current_url,
444
- )
445
-
446
- # Check if URL should be skipped using helper method
447
  should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
448
  if should_skip:
449
- if progress_callback and skip_reason:
450
- dynamic_total = (
451
- initial_total_for_progress
452
- if self.use_sitemap
453
- else processed_count + len(urls_to_visit) + 1
 
 
454
  )
455
- progress_callback(processed_count, dynamic_total, skip_reason)
456
  continue
457
 
 
 
 
458
  self.logger.info(
459
- f"Crawling (Depth {current_depth}): {current_url} ({processed_count + 1}/{current_total_for_progress})"
460
  )
461
 
462
- if progress_callback:
463
- progress_callback(
464
- processed_count, current_total_for_progress, current_url
465
- )
466
-
467
  self.visited_urls.add(current_url)
468
-
469
  self.rate_limiter.wait()
470
 
471
  try:
472
- response = self.session.get(current_url, timeout=10)
473
- response.raise_for_status()
474
- html_content = response.text
475
- soup = BeautifulSoup(html_content, "html.parser")
476
-
477
- # Extract metadata using helper method
478
- page_title, meta_description, meta_keywords = (
479
- self._extract_page_metadata(soup, current_url)
480
- )
481
-
482
- text_content = self._extract_text(soup)
483
-
484
- page_data = CrawledPage(
485
- url=current_url,
486
- html_content=html_content,
487
- text_content=text_content,
488
- title=page_title,
489
- meta_description=meta_description,
490
- meta_keywords=meta_keywords,
491
- crawl_depth=current_depth,
492
- parent_url=current_parent_url,
493
  )
494
  crawled_pages.append(page_data)
495
  self.logger.info(f"Successfully processed and stored: {current_url}")
496
 
497
- if current_depth < self.max_depth:
498
- found_links = self._extract_links(soup, current_url)
499
- self.logger.debug(
500
- f"Found {len(found_links)} links on {current_url}"
501
- )
502
- for link in found_links:
503
- if link not in self.visited_urls:
504
- urls_to_visit.append((link, current_depth + 1, current_url))
505
-
506
- except requests.exceptions.HTTPError as e:
507
- self.logger.error(
508
- f"HTTPError for {current_url}: {e.response.status_code} - {e.response.reason}. Response: {e.response.text[:200]}...",
509
- exc_info=False,
510
- )
511
- processed_count += 1
512
- except requests.exceptions.ConnectionError as e:
513
- self.logger.error(
514
- f"ConnectionError for {current_url}: {e}", exc_info=False
515
- )
516
- processed_count += 1
517
- except requests.exceptions.Timeout as e:
518
- self.logger.error(f"Timeout for {current_url}: {e}", exc_info=False)
519
- processed_count += 1
520
- except requests.exceptions.RequestException as e:
521
- self.logger.error(
522
- f"RequestException for {current_url}: {e}", exc_info=True
523
  )
524
- processed_count += 1
525
  except Exception as e:
526
- self.logger.error(
527
- f"An unexpected error occurred while processing {current_url}: {e}",
528
- exc_info=True,
529
- )
530
  processed_count += 1
 
 
 
531
 
532
  self.logger.info(
533
- f"Crawl completed. Total pages processed/attempted: {processed_count}. Successfully crawled pages: {len(crawled_pages)}"
 
534
  )
535
  if progress_callback:
536
  progress_callback(processed_count, processed_count, "Crawling complete.")
 
418
 
419
  return False, None
420
 
421
+ def _calculate_progress_total(
422
+ self, processed_count: int, urls_to_visit_len: int, initial_total: int
423
+ ) -> int:
424
+ """Calculate the total for progress reporting."""
425
+ if self.use_sitemap:
426
+ return initial_total
427
+ return processed_count + urls_to_visit_len + 1
428
+
429
+ def _update_crawl_progress(
430
+ self,
431
+ progress_callback: Optional[Callable[[int, int, str], None]],
432
+ processed_count: int,
433
+ urls_to_visit_len: int,
434
+ initial_total: int,
435
+ message: str,
436
+ ) -> None:
437
+ """Update progress callback if provided."""
438
+ if progress_callback:
439
+ total = self._calculate_progress_total(
440
+ processed_count, urls_to_visit_len, initial_total
441
+ )
442
+ progress_callback(processed_count, total, message)
443
+
444
+ def _fetch_and_parse_url(
445
+ self, url: str, depth: int, parent_url: Optional[str]
446
+ ) -> Tuple[CrawledPage, BeautifulSoup]:
447
+ """Fetch URL and create CrawledPage object.
448
+
449
+ Args:
450
+ url: URL to fetch
451
+ depth: Current crawl depth
452
+ parent_url: URL of the parent page
453
+
454
+ Returns:
455
+ Tuple of (CrawledPage, BeautifulSoup) for further processing
456
+
457
+ Raises:
458
+ requests.RequestException: If the HTTP request fails
459
+ """
460
+ response = self.session.get(url, timeout=10)
461
+ response.raise_for_status()
462
+ html_content = response.text
463
+ soup = BeautifulSoup(html_content, "html.parser")
464
+
465
+ page_title, meta_description, meta_keywords = self._extract_page_metadata(
466
+ soup, url
467
+ )
468
+ text_content = self._extract_text(soup)
469
+
470
+ return CrawledPage(
471
+ url=url,
472
+ html_content=html_content,
473
+ text_content=text_content,
474
+ title=page_title,
475
+ meta_description=meta_description,
476
+ meta_keywords=meta_keywords,
477
+ crawl_depth=depth,
478
+ parent_url=parent_url,
479
+ ), soup
480
+
481
+ def _enqueue_discovered_links(
482
+ self,
483
+ soup: BeautifulSoup,
484
+ current_url: str,
485
+ current_depth: int,
486
+ urls_to_visit: List[Tuple[str, int, Optional[str]]],
487
+ ) -> None:
488
+ """Extract links from page and add unvisited ones to queue."""
489
+ if current_depth >= self.max_depth:
490
+ return
491
+
492
+ found_links = self._extract_links(soup, current_url)
493
+ self.logger.debug(f"Found {len(found_links)} links on {current_url}")
494
+ for link in found_links:
495
+ if link not in self.visited_urls:
496
+ urls_to_visit.append((link, current_depth + 1, current_url))
497
+
498
+ def _handle_crawl_error(self, url: str, error: Exception) -> None:
499
+ """Log crawl error with appropriate detail level."""
500
+ if isinstance(error, requests.exceptions.HTTPError):
501
+ self.logger.error(
502
+ f"HTTPError for {url}: {error.response.status_code} - {error.response.reason}. "
503
+ f"Response: {error.response.text[:200]}...",
504
+ exc_info=False,
505
+ )
506
+ elif isinstance(error, requests.exceptions.ConnectionError):
507
+ self.logger.error(f"ConnectionError for {url}: {error}", exc_info=False)
508
+ elif isinstance(error, requests.exceptions.Timeout):
509
+ self.logger.error(f"Timeout for {url}: {error}", exc_info=False)
510
+ elif isinstance(error, requests.exceptions.RequestException):
511
+ self.logger.error(f"RequestException for {url}: {error}", exc_info=True)
512
+ else:
513
+ self.logger.error(
514
+ f"An unexpected error occurred while processing {url}: {error}",
515
+ exc_info=True,
516
+ )
517
+
518
  def crawl(
519
  self, progress_callback: Optional[Callable[[int, int, str], None]] = None
520
  ) -> List[CrawledPage]:
521
+ """Crawl website starting from the configured URL.
522
+
523
+ Args:
524
+ progress_callback: Optional callback for progress updates (processed, total, message)
525
+
526
+ Returns:
527
+ List of CrawledPage objects for successfully crawled pages
528
+ """
529
  urls_to_visit = self._initialize_crawl_queue()
530
  crawled_pages: List[CrawledPage] = []
531
+ initial_total = len(urls_to_visit)
 
532
  processed_count = 0
533
+
534
  while urls_to_visit:
535
  current_url, current_depth, current_parent_url = urls_to_visit.pop(0)
536
 
537
+ self._update_crawl_progress(
538
+ progress_callback,
539
+ processed_count,
540
+ len(urls_to_visit),
541
+ initial_total,
542
+ current_url,
543
  )
544
 
 
 
 
 
 
 
 
 
545
  should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
546
  if should_skip:
547
+ if skip_reason:
548
+ self._update_crawl_progress(
549
+ progress_callback,
550
+ processed_count,
551
+ len(urls_to_visit),
552
+ initial_total,
553
+ skip_reason,
554
  )
 
555
  continue
556
 
557
+ total = self._calculate_progress_total(
558
+ processed_count, len(urls_to_visit), initial_total
559
+ )
560
  self.logger.info(
561
+ f"Crawling (Depth {current_depth}): {current_url} ({processed_count + 1}/{total})"
562
  )
563
 
 
 
 
 
 
564
  self.visited_urls.add(current_url)
 
565
  self.rate_limiter.wait()
566
 
567
  try:
568
+ page_data, soup = self._fetch_and_parse_url(
569
+ current_url, current_depth, current_parent_url
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
  )
571
  crawled_pages.append(page_data)
572
  self.logger.info(f"Successfully processed and stored: {current_url}")
573
 
574
+ self._enqueue_discovered_links(
575
+ soup, current_url, current_depth, urls_to_visit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  )
577
+
578
  except Exception as e:
579
+ self._handle_crawl_error(current_url, e)
 
 
 
580
  processed_count += 1
581
+ continue
582
+
583
+ processed_count += 1
584
 
585
  self.logger.info(
586
+ f"Crawl completed. Total pages processed/attempted: {processed_count}. "
587
+ f"Successfully crawled pages: {len(crawled_pages)}"
588
  )
589
  if progress_callback:
590
  progress_callback(processed_count, processed_count, "Crawling complete.")
ankigen_core/ui_logic.py CHANGED
@@ -3,7 +3,9 @@
3
  import gradio as gr
4
  import pandas as pd # Needed for use_selected_subjects type hinting
5
  from typing import (
 
6
  List,
 
7
  Tuple,
8
  )
9
  from urllib.parse import urlparse
@@ -12,7 +14,7 @@ from urllib.parse import urlparse
12
  import re # For URL validation and filename sanitization
13
  import asyncio
14
 
15
- from ankigen_core.crawler import WebCrawler
16
  from ankigen_core.llm_interface import (
17
  OpenAIClientManager,
18
  )
@@ -436,6 +438,132 @@ def _basic_sanitize_filename(name: str) -> str:
436
  return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
437
 
438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  async def crawl_and_generate(
440
  url: str,
441
  max_depth: int,
@@ -453,145 +581,46 @@ async def crawl_and_generate(
453
  status_textbox: gr.Textbox,
454
  ) -> Tuple[str, List[dict], List[Card]]:
455
  """Crawls a website, generates Anki cards, and prepares them for export/display."""
456
- # Initialize crawler_ui_logger if it's meant to be used here, e.g., at the start of the function
457
- # For now, assuming it's available in the scope (e.g., global or passed in if it were a class)
458
- # If it's a module-level logger, it should be fine.
459
-
460
- # Ensure the status_textbox is updated via gr.Info or similar if needed
461
- # as it's a parameter but not directly used for output updates in the provided snippet.
462
- # It might be used by side-effect if gr.Info/gr.Warning updates it globally, or if it's part of `progress`.
463
-
464
- # The `status_textbox` parameter is not directly used to set a value in the return,
465
- # but `gr.Info` might update a default status area, or it's for other UI purposes.
466
-
467
  crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
468
- if not url or not url.startswith(("http://", "https://")):
469
- gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
470
  return "Invalid URL", [], []
471
 
472
  try:
473
- urlparse(url)
474
- # domain = parsed_url.netloc # allowed_domains is removed from WebCrawler call
475
- # if not domain:
476
- # gr.Warning("Could not parse domain from URL. Please enter a valid URL.")
477
- # return "Invalid URL (cannot parse domain)", [], []
478
-
479
- include_list = [p.strip() for p in include_patterns.split(",") if p.strip()]
480
- exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()]
481
-
482
- # WebCrawler instantiation updated to remove parameters causing issues.
483
- # The WebCrawler will use its defaults or other configured ways for these.
484
- # The 'requests_per_second' from UI maps to 'delay_between_requests' internally if crawler supports it,
485
- # but since 'delay_between_requests' was also flagged, we remove it.
486
- # The WebCrawler class itself needs to be checked for its actual constructor parameters.
487
- crawler = WebCrawler(
488
- start_url=url,
489
- max_depth=max_depth, # Assuming max_depth is still a valid param
490
- # allowed_domains=[domain], # Removed based on linter error
491
- # delay_between_requests=1.0 / crawler_requests_per_second # Removed
492
- # if crawler_requests_per_second > 0
493
- # else 0.1,
494
- # max_pages=500, # Removed
495
- include_patterns=include_list, # Assuming this is valid
496
- exclude_patterns=exclude_list, # Assuming this is valid
497
- use_sitemap=use_sitemap, # Assuming this is valid
498
- sitemap_url=sitemap_url_str
499
- if use_sitemap and sitemap_url_str and sitemap_url_str.strip()
500
- else None,
501
- )
502
-
503
- total_urls_for_progress = 0
504
-
505
- def crawler_progress_callback(
506
- processed_count: int, total_urls: int, current_url_processing: str
507
- ):
508
- nonlocal total_urls_for_progress
509
- total_urls_for_progress = total_urls
510
- if total_urls_for_progress > 0:
511
- progress(
512
- 0.1 + (processed_count / total_urls_for_progress) * 0.4,
513
- desc=f"Crawling: {processed_count}/{total_urls_for_progress} URLs. Current: {current_url_processing}",
514
- )
515
- else:
516
- progress(
517
- 0.1 + processed_count * 0.01,
518
- desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url_processing}",
519
- )
520
-
521
- crawler_ui_logger.info(f"Starting crawl for {url}...")
522
- progress(0.15, desc=f"Starting crawl for {url}...")
523
- crawled_pages = await asyncio.to_thread(
524
- crawler.crawl, progress_callback=crawler_progress_callback
525
  )
526
- crawler_ui_logger.info(f"Crawling finished. Found {len(crawled_pages)} pages.")
527
- progress(0.5, desc=f"Crawling finished. Found {len(crawled_pages)} pages.")
528
 
 
529
  if not crawled_pages:
530
  progress(1.0, desc="No pages were crawled. Check URL and patterns.")
531
- # Return structure: (status_message, df_data, raw_cards_data)
532
  return (
533
  "No pages were crawled. Check URL and patterns.",
534
  pd.DataFrame().to_dict(orient="records"),
535
  [],
536
  )
537
 
538
- # --- AGENT SYSTEM INTEGRATION FOR WEB CRAWLING ---
539
- crawler_ui_logger.info("🤖 Using agent system for web crawling card generation")
540
-
541
- # Initialize agent orchestrator
542
- orchestrator = AgentOrchestrator(client_manager)
543
- await orchestrator.initialize("dummy-key") # Key already in client_manager
544
-
545
- # Combine all crawled content into a single context
546
- combined_content = "\n\n--- PAGE BREAK ---\n\n".join(
547
- [
548
- f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."
549
- for page in crawled_pages[
550
- :10
551
- ] # Limit to first 10 pages to avoid token limits
552
- ]
553
- )
554
-
555
- context = {
556
- "source_text": combined_content,
557
- "crawl_source": url,
558
- "pages_crawled": len(crawled_pages),
559
- }
560
-
561
- progress(0.6, desc="🤖 Processing with agent system...")
562
-
563
- # Generate cards with agents
564
- agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
565
- topic=f"Content from {url}",
566
- subject="web_content",
567
- num_cards=min(len(crawled_pages) * 3, 50), # 3 cards per page, max 50
568
- difficulty="intermediate",
569
- enable_quality_pipeline=True,
570
- context=context,
571
  )
572
 
573
  if agent_cards:
574
- progress(0.9, desc=f"🤖 Agent system generated {len(agent_cards)} cards")
575
-
576
  cards_for_dataframe_export = generate_cards_from_crawled_content(
577
  agent_cards
578
  )
579
-
580
- final_message = f"🤖 Agent system processed content from {len(crawled_pages)} pages. Generated {len(agent_cards)} high-quality cards."
581
  progress(1.0, desc=final_message)
582
-
583
- return (
584
- final_message,
585
- cards_for_dataframe_export,
586
- agent_cards,
587
- )
588
  else:
589
- progress(1.0, desc="🤖 Agent system returned no cards")
590
- return (
591
- "Agent system returned no cards",
592
- pd.DataFrame().to_dict(orient="records"),
593
- [],
594
- )
595
 
596
  except ConnectionError as e:
597
  crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
@@ -618,14 +647,6 @@ async def crawl_and_generate(
618
  [],
619
  )
620
 
621
- final_message = f"Content crawled and processed. {len(cards_for_dataframe_export) if cards_for_dataframe_export else 0} potential cards prepared. Load them into the main table for review and export."
622
- progress(1.0, desc=final_message)
623
- return (
624
- final_message,
625
- cards_for_dataframe_export,
626
- agent_cards,
627
- ) # agent_cards is List[Card]
628
-
629
 
630
  # --- Card Preview and Editing Utilities (Task 13.3) ---
631
 
 
3
  import gradio as gr
4
  import pandas as pd # Needed for use_selected_subjects type hinting
5
  from typing import (
6
+ Callable,
7
  List,
8
+ Optional,
9
  Tuple,
10
  )
11
  from urllib.parse import urlparse
 
14
  import re # For URL validation and filename sanitization
15
  import asyncio
16
 
17
+ from ankigen_core.crawler import CrawledPage, WebCrawler
18
  from ankigen_core.llm_interface import (
19
  OpenAIClientManager,
20
  )
 
438
  return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
439
 
440
 
441
+ def _validate_crawl_url(url: str) -> bool:
442
+ """Validate URL for crawling."""
443
+ if not url or not url.startswith(("http://", "https://")):
444
+ gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
445
+ return False
446
+ try:
447
+ urlparse(url)
448
+ return True
449
+ except Exception:
450
+ return False
451
+
452
+
453
def _create_web_crawler(
    url: str,
    max_depth: int,
    include_patterns: str,
    exclude_patterns: str,
    use_sitemap: bool,
    sitemap_url_str: str,
) -> WebCrawler:
    """Build a ``WebCrawler`` from the raw values of the crawl form.

    Args:
        url: Start URL handed to the crawler as ``start_url``.
        max_depth: Maximum crawl depth, forwarded unchanged.
        include_patterns: Comma-separated patterns; blank entries are dropped.
        exclude_patterns: Comma-separated patterns; blank entries are dropped.
        use_sitemap: Whether the crawler should use a sitemap.
        sitemap_url_str: Sitemap URL; only forwarded when ``use_sitemap`` is
            true and the value is non-blank, otherwise ``None`` is passed.

    Returns:
        The configured ``WebCrawler`` instance.
    """

    def split_patterns(raw: Optional[str]) -> List[str]:
        # A missing value is treated like an empty field instead of crashing.
        return [part.strip() for part in (raw or "").split(",") if part.strip()]

    # Normalise once so the emptiness check and the forwarded value agree;
    # previously the check used the stripped text but the raw text was passed.
    sitemap_url = (sitemap_url_str or "").strip()

    return WebCrawler(
        start_url=url,
        max_depth=max_depth,
        include_patterns=split_patterns(include_patterns),
        exclude_patterns=split_patterns(exclude_patterns),
        use_sitemap=use_sitemap,
        sitemap_url=sitemap_url if use_sitemap and sitemap_url else None,
    )
475
+
476
+
477
def _create_crawl_progress_callback(
    progress: gr.Progress,
) -> Tuple[Callable[[int, int, str], None], List[int]]:
    """Create the crawler progress callback plus its shared state container.

    The callback maps crawl progress onto the 0.1-0.5 band of the overall
    progress bar. The reported value is clamped to that band so a large crawl
    (or a processed count that exceeds the announced total) can never push the
    bar past the point where the caller later reports the crawl as finished.

    Args:
        progress: Gradio progress reporter, called as ``progress(value, desc=...)``.

    Returns:
        A ``(callback, total_urls_container)`` tuple. ``callback`` accepts
        ``(processed_count, total_urls, current_url)``;
        ``total_urls_container`` is a one-element list holding the most
        recently reported ``total_urls``.
    """
    band_start, band_end = 0.1, 0.5
    total_urls_container = [0]  # Mutable container for nonlocal-like behavior

    def callback(processed_count: int, total_urls: int, current_url: str) -> None:
        total_urls_container[0] = total_urls
        if total_urls > 0:
            value = band_start + (processed_count / total_urls) * 0.4
            desc = (
                f"Crawling: {processed_count}/{total_urls} URLs. "
                f"Current: {current_url}"
            )
        else:
            # Total not known yet: advance by a small fixed step per URL.
            value = band_start + processed_count * 0.01
            desc = (
                f"Crawling: {processed_count} URLs discovered. "
                f"Current: {current_url}"
            )
        progress(min(max(value, band_start), band_end), desc=desc)

    return callback, total_urls_container
497
+
498
+
499
async def _perform_web_crawl(
    crawler: WebCrawler,
    progress: gr.Progress,
    url: str,
) -> Optional[List[CrawledPage]]:
    """Run the crawl in a worker thread and report its progress.

    Args:
        crawler: Configured crawler whose ``crawl`` method is executed.
        progress: Gradio progress reporter updated before, during and after
            the crawl.
        url: Start URL, used only for the log and progress messages.

    Returns:
        The crawled pages, or ``None`` when the crawl produced nothing.
    """
    on_progress = _create_crawl_progress_callback(progress)[0]

    start_message = f"Starting crawl for {url}..."
    crawler_ui_logger.info(start_message)
    progress(0.15, desc=start_message)

    # crawler.crawl is a blocking call, so it is run via asyncio.to_thread.
    pages = await asyncio.to_thread(crawler.crawl, progress_callback=on_progress)

    done_message = f"Crawling finished. Found {len(pages)} pages."
    crawler_ui_logger.info(done_message)
    progress(0.5, desc=done_message)

    return pages or None
516
+
517
+
518
async def _process_crawled_with_agents(
    crawled_pages: List[CrawledPage],
    client_manager: OpenAIClientManager,
    url: str,
    progress: gr.Progress,
) -> Tuple[List[Card], str]:
    """Generate cards from crawled pages through the agent orchestrator.

    The first 10 pages are condensed (URL, title and the first 2000 characters
    of text each) into one context string. The requested card count is three
    per crawled page, capped at 50.

    Args:
        crawled_pages: Pages returned by the crawler.
        client_manager: Client manager handed to ``AgentOrchestrator``.
        url: Crawl start URL, used in the topic and the context.
        progress: Gradio progress reporter.

    Returns:
        ``(cards, message)`` where ``cards`` is the generated card list (empty
        when the agents returned nothing) and ``message`` is a status summary.
    """
    crawler_ui_logger.info("Using agent system for web crawling card generation")

    orchestrator = AgentOrchestrator(client_manager)
    # Per the original note, the API key is already configured in
    # client_manager, so an empty placeholder is passed here.
    await orchestrator.initialize("")

    page_sections = []
    for page in crawled_pages[:10]:
        page_sections.append(
            f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."
        )

    page_count = len(crawled_pages)
    context = {
        "source_text": "\n\n--- PAGE BREAK ---\n\n".join(page_sections),
        "crawl_source": url,
        "pages_crawled": page_count,
    }

    progress(0.6, desc="Processing with agent system...")

    generated_cards, _metadata = await orchestrator.generate_cards_with_agents(
        topic=f"Content from {url}",
        subject="web_content",
        num_cards=min(page_count * 3, 50),
        difficulty="intermediate",
        enable_quality_pipeline=True,
        context=context,
    )

    if not generated_cards:
        return [], "Agent system returned no cards"

    progress(0.9, desc=f"Agent system generated {len(generated_cards)} cards")
    summary = (
        f"Agent system processed content from {page_count} pages. "
        f"Generated {len(generated_cards)} high-quality cards."
    )
    return generated_cards, summary
565
+
566
+
567
  async def crawl_and_generate(
568
  url: str,
569
  max_depth: int,
 
581
  status_textbox: gr.Textbox,
582
  ) -> Tuple[str, List[dict], List[Card]]:
583
  """Crawls a website, generates Anki cards, and prepares them for export/display."""
 
 
 
 
 
 
 
 
 
 
 
584
  crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
585
+
586
+ if not _validate_crawl_url(url):
587
  return "Invalid URL", [], []
588
 
589
  try:
590
+ crawler = _create_web_crawler(
591
+ url,
592
+ max_depth,
593
+ include_patterns,
594
+ exclude_patterns,
595
+ use_sitemap,
596
+ sitemap_url_str,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  )
 
 
598
 
599
+ crawled_pages = await _perform_web_crawl(crawler, progress, url)
600
  if not crawled_pages:
601
  progress(1.0, desc="No pages were crawled. Check URL and patterns.")
 
602
  return (
603
  "No pages were crawled. Check URL and patterns.",
604
  pd.DataFrame().to_dict(orient="records"),
605
  [],
606
  )
607
 
608
+ agent_cards, final_message = await _process_crawled_with_agents(
609
+ crawled_pages,
610
+ client_manager,
611
+ url,
612
+ progress,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613
  )
614
 
615
  if agent_cards:
 
 
616
  cards_for_dataframe_export = generate_cards_from_crawled_content(
617
  agent_cards
618
  )
 
 
619
  progress(1.0, desc=final_message)
620
+ return final_message, cards_for_dataframe_export, agent_cards
 
 
 
 
 
621
  else:
622
+ progress(1.0, desc=final_message)
623
+ return final_message, pd.DataFrame().to_dict(orient="records"), []
 
 
 
 
624
 
625
  except ConnectionError as e:
626
  crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
 
647
  [],
648
  )
649
 
 
 
 
 
 
 
 
 
650
 
651
  # --- Card Preview and Editing Utilities (Task 13.3) ---
652