Upload folder using huggingface_hub
Browse files- .gitignore +1 -0
- ankigen_core/agents/base.py +82 -72
- ankigen_core/agents/generators.py +78 -68
- ankigen_core/agents/token_tracker.py +20 -9
- ankigen_core/card_generator.py +105 -100
- ankigen_core/context7.py +136 -109
- ankigen_core/crawler.py +135 -81
- ankigen_core/ui_logic.py +147 -126
.gitignore
CHANGED
|
@@ -199,3 +199,4 @@ scripts/
|
|
| 199 |
.taskmasterconfig
|
| 200 |
.cursor
|
| 201 |
.serena/
|
|
|
|
|
|
| 199 |
.taskmasterconfig
|
| 200 |
.cursor
|
| 201 |
.serena/
|
| 202 |
+
.serena/
|
ankigen_core/agents/base.py
CHANGED
|
@@ -100,30 +100,17 @@ class BaseAgentWrapper:
|
|
| 100 |
logger.error(f"Failed to initialize agent {self.config.name}: {e}")
|
| 101 |
raise
|
| 102 |
|
| 103 |
-
|
| 104 |
-
self, user_input: str, context: Optional[Dict[str, Any]]
|
| 105 |
-
) ->
|
| 106 |
-
"""
|
| 107 |
-
if
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
enhanced_input = f"{user_input}\n\nContext:\n{context_str}"
|
| 115 |
-
|
| 116 |
-
# Execute the agent using Runner.run() with retry logic
|
| 117 |
-
if self.agent is None:
|
| 118 |
-
raise ValueError("Agent not initialized")
|
| 119 |
-
|
| 120 |
-
logger.info(f"🤖 EXECUTING AGENT: {self.config.name}")
|
| 121 |
-
logger.info(f"📝 INPUT: {enhanced_input[:200]}...")
|
| 122 |
-
|
| 123 |
-
import time
|
| 124 |
-
|
| 125 |
-
start_time = time.time()
|
| 126 |
-
|
| 127 |
for attempt in range(self.config.retry_attempts):
|
| 128 |
try:
|
| 129 |
result = await asyncio.wait_for(
|
|
@@ -133,63 +120,86 @@ class BaseAgentWrapper:
|
|
| 133 |
),
|
| 134 |
timeout=self.config.timeout,
|
| 135 |
)
|
| 136 |
-
|
| 137 |
except asyncio.TimeoutError:
|
| 138 |
if attempt < self.config.retry_attempts - 1:
|
| 139 |
logger.warning(
|
| 140 |
-
f"Agent {self.config.name} timed out
|
|
|
|
| 141 |
)
|
| 142 |
continue
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
logger.info(
|
| 152 |
-
f"
|
| 153 |
)
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
f"✅ STRUCTURED OUTPUT: {type(text_output)} -> {self.config.output_type}"
|
| 186 |
-
)
|
| 187 |
-
# The agents SDK should return the structured object directly
|
| 188 |
-
return text_output, total_usage
|
| 189 |
-
else:
|
| 190 |
-
return text_output, total_usage
|
| 191 |
-
else:
|
| 192 |
-
return str(result), total_usage
|
| 193 |
|
| 194 |
except asyncio.TimeoutError:
|
| 195 |
logger.error(
|
|
|
|
| 100 |
logger.error(f"Failed to initialize agent {self.config.name}: {e}")
|
| 101 |
raise
|
| 102 |
|
| 103 |
+
def _enhance_input_with_context(
|
| 104 |
+
self, user_input: str, context: Optional[Dict[str, Any]]
|
| 105 |
+
) -> str:
|
| 106 |
+
"""Add context to user input if provided."""
|
| 107 |
+
if context is None:
|
| 108 |
+
return user_input
|
| 109 |
+
context_str = "\n".join([f"{k}: {v}" for k, v in context.items()])
|
| 110 |
+
return f"{user_input}\n\nContext:\n{context_str}"
|
| 111 |
+
|
| 112 |
+
async def _execute_with_retry(self, enhanced_input: str) -> Any:
|
| 113 |
+
"""Execute agent with retry logic on timeout."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
for attempt in range(self.config.retry_attempts):
|
| 115 |
try:
|
| 116 |
result = await asyncio.wait_for(
|
|
|
|
| 120 |
),
|
| 121 |
timeout=self.config.timeout,
|
| 122 |
)
|
| 123 |
+
return result
|
| 124 |
except asyncio.TimeoutError:
|
| 125 |
if attempt < self.config.retry_attempts - 1:
|
| 126 |
logger.warning(
|
| 127 |
+
f"Agent {self.config.name} timed out "
|
| 128 |
+
f"(attempt {attempt + 1}/{self.config.retry_attempts}), retrying..."
|
| 129 |
)
|
| 130 |
continue
|
| 131 |
+
logger.error(
|
| 132 |
+
f"Agent {self.config.name} timed out after {self.config.retry_attempts} attempts"
|
| 133 |
+
)
|
| 134 |
+
raise
|
| 135 |
+
raise RuntimeError("Retry loop exited without result")
|
| 136 |
+
|
| 137 |
+
def _extract_and_track_usage(self, result: Any) -> Dict[str, Any]:
|
| 138 |
+
"""Extract usage info from result and track it."""
|
| 139 |
+
total_usage = {
|
| 140 |
+
"input_tokens": 0,
|
| 141 |
+
"output_tokens": 0,
|
| 142 |
+
"total_tokens": 0,
|
| 143 |
+
"requests": 0,
|
| 144 |
+
}
|
| 145 |
|
| 146 |
+
if hasattr(result, "raw_responses") and result.raw_responses:
|
| 147 |
+
for response in result.raw_responses:
|
| 148 |
+
if hasattr(response, "usage") and response.usage:
|
| 149 |
+
total_usage["input_tokens"] += response.usage.input_tokens
|
| 150 |
+
total_usage["output_tokens"] += response.usage.output_tokens
|
| 151 |
+
total_usage["total_tokens"] += response.usage.total_tokens
|
| 152 |
+
total_usage["requests"] += response.usage.requests
|
| 153 |
+
|
| 154 |
+
track_usage_from_agents_sdk(total_usage, self.config.model)
|
| 155 |
+
logger.info(f"Agent usage: {total_usage}")
|
| 156 |
+
|
| 157 |
+
return total_usage
|
| 158 |
+
|
| 159 |
+
def _extract_output(self, result: Any) -> Any:
|
| 160 |
+
"""Extract final output from agent result."""
|
| 161 |
+
if not (hasattr(result, "new_items") and result.new_items):
|
| 162 |
+
return str(result)
|
| 163 |
+
|
| 164 |
+
from agents.items import ItemHelpers
|
| 165 |
+
|
| 166 |
+
text_output = ItemHelpers.text_message_outputs(result.new_items)
|
| 167 |
+
|
| 168 |
+
if self.config.output_type and self.config.output_type is not str:
|
| 169 |
logger.info(
|
| 170 |
+
f"Structured output: {type(text_output)} -> {self.config.output_type}"
|
| 171 |
)
|
| 172 |
|
| 173 |
+
return text_output
|
| 174 |
+
|
| 175 |
+
async def execute(
|
| 176 |
+
self, user_input: str, context: Optional[Dict[str, Any]] = None
|
| 177 |
+
) -> tuple[Any, Dict[str, Any]]:
|
| 178 |
+
"""Execute the agent with user input and optional context."""
|
| 179 |
+
if not self.agent:
|
| 180 |
+
await self.initialize()
|
| 181 |
+
|
| 182 |
+
if self.agent is None:
|
| 183 |
+
raise ValueError("Agent not initialized")
|
| 184 |
+
|
| 185 |
+
enhanced_input = self._enhance_input_with_context(user_input, context)
|
| 186 |
+
|
| 187 |
+
logger.info(f"Executing agent: {self.config.name}")
|
| 188 |
+
logger.info(f"Input: {enhanced_input[:200]}...")
|
| 189 |
+
|
| 190 |
+
import time
|
| 191 |
+
|
| 192 |
+
start_time = time.time()
|
| 193 |
+
|
| 194 |
+
try:
|
| 195 |
+
result = await self._execute_with_retry(enhanced_input)
|
| 196 |
+
execution_time = time.time() - start_time
|
| 197 |
+
logger.info(f"Agent {self.config.name} executed in {execution_time:.2f}s")
|
| 198 |
+
|
| 199 |
+
total_usage = self._extract_and_track_usage(result)
|
| 200 |
+
output = self._extract_output(result)
|
| 201 |
+
|
| 202 |
+
return output, total_usage
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
except asyncio.TimeoutError:
|
| 205 |
logger.error(
|
ankigen_core/agents/generators.py
CHANGED
|
@@ -67,10 +67,8 @@ class SubjectExpertAgent(BaseAgentWrapper):
|
|
| 67 |
"subject_expert configuration not found - agent system not properly initialized"
|
| 68 |
)
|
| 69 |
|
| 70 |
-
# Enable structured output for card generation
|
| 71 |
base_config.output_type = CardsGenerationSchema
|
| 72 |
|
| 73 |
-
# Customize instructions for the specific subject
|
| 74 |
if subject != "general" and base_config.custom_prompts:
|
| 75 |
subject_prompt = base_config.custom_prompts.get(subject.lower(), "")
|
| 76 |
if subject_prompt:
|
|
@@ -81,102 +79,114 @@ class SubjectExpertAgent(BaseAgentWrapper):
|
|
| 81 |
super().__init__(base_config, openai_client)
|
| 82 |
self.subject = subject
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
async def generate_cards(
|
| 85 |
self, topic: str, num_cards: int = 5, context: Optional[Dict[str, Any]] = None
|
| 86 |
) -> List[Card]:
|
| 87 |
-
"""Generate flashcards for a given topic with automatic batching
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
|
|
|
| 103 |
|
|
|
|
| 104 |
while cards_remaining > 0:
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
-
logger.info(
|
| 108 |
-
f"Generating batch {batch_num}: {cards_in_this_batch} cards"
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
# Initialize agent only once - Runner.run() creates fresh context each time
|
| 112 |
-
# No conversation history accumulation across batches (significant performance gain)
|
| 113 |
if not self.agent:
|
| 114 |
await self.initialize()
|
| 115 |
|
| 116 |
-
user_input = (
|
| 117 |
-
|
| 118 |
)
|
| 119 |
-
|
| 120 |
-
# Add cloze generation instruction if enabled
|
| 121 |
-
if context and context.get("generate_cloze"):
|
| 122 |
-
user_input += "\n\nIMPORTANT: Generate a mix of card types including cloze cards. For code examples, syntax, and fill-in-the-blank concepts, use cloze cards (card_type='cloze'). Aim for roughly 50% cloze cards when dealing with technical/programming content."
|
| 123 |
-
|
| 124 |
-
if context:
|
| 125 |
-
user_input += f"\n\nAdditional context: {context}"
|
| 126 |
-
|
| 127 |
-
# Add previous topics to avoid repetition instead of full conversation history
|
| 128 |
-
if previous_card_topics:
|
| 129 |
-
topics_summary = ", ".join(
|
| 130 |
-
previous_card_topics[-20:]
|
| 131 |
-
) # Last 20 topics to keep it manageable
|
| 132 |
-
user_input += f"\n\nAvoid creating cards about these already covered topics: {topics_summary}"
|
| 133 |
-
|
| 134 |
-
if batch_num > 1:
|
| 135 |
-
user_input += f"\n\nThis is batch {batch_num} of cards. Ensure these cards cover different aspects of the topic."
|
| 136 |
-
|
| 137 |
response, usage = await self.execute(user_input, context)
|
| 138 |
|
| 139 |
-
|
| 140 |
-
if usage:
|
| 141 |
-
for key in total_usage:
|
| 142 |
-
total_usage[key] += usage.get(key, 0)
|
| 143 |
-
|
| 144 |
batch_cards = self._parse_cards_response(response, topic)
|
| 145 |
all_cards.extend(batch_cards)
|
| 146 |
|
| 147 |
-
|
| 148 |
-
for card in batch_cards:
|
| 149 |
-
if hasattr(card, "front") and card.front and card.front.question:
|
| 150 |
-
# Extract key terms from the question for deduplication
|
| 151 |
-
question_words = card.front.question.lower().split()
|
| 152 |
-
key_terms = [word for word in question_words if len(word) > 3][
|
| 153 |
-
:3
|
| 154 |
-
] # First 3 meaningful words
|
| 155 |
-
if key_terms:
|
| 156 |
-
previous_card_topics.append(" ".join(key_terms))
|
| 157 |
-
|
| 158 |
cards_remaining -= len(batch_cards)
|
| 159 |
-
batch_num += 1
|
| 160 |
|
| 161 |
logger.info(
|
| 162 |
-
f"Batch {batch_num
|
| 163 |
)
|
| 164 |
|
| 165 |
-
# Safety check to prevent infinite loops
|
| 166 |
if len(batch_cards) == 0:
|
| 167 |
-
logger.warning(
|
| 168 |
-
f"No cards generated in batch {batch_num - 1}, stopping generation"
|
| 169 |
-
)
|
| 170 |
break
|
| 171 |
|
| 172 |
-
|
|
|
|
| 173 |
if total_usage.get("total_tokens", 0) > 0:
|
| 174 |
logger.info(
|
| 175 |
-
f"
|
|
|
|
| 176 |
)
|
| 177 |
|
| 178 |
logger.info(
|
| 179 |
-
f"
|
| 180 |
)
|
| 181 |
return all_cards
|
| 182 |
|
|
|
|
| 67 |
"subject_expert configuration not found - agent system not properly initialized"
|
| 68 |
)
|
| 69 |
|
|
|
|
| 70 |
base_config.output_type = CardsGenerationSchema
|
| 71 |
|
|
|
|
| 72 |
if subject != "general" and base_config.custom_prompts:
|
| 73 |
subject_prompt = base_config.custom_prompts.get(subject.lower(), "")
|
| 74 |
if subject_prompt:
|
|
|
|
| 79 |
super().__init__(base_config, openai_client)
|
| 80 |
self.subject = subject
|
| 81 |
|
| 82 |
+
def _build_batch_prompt(
|
| 83 |
+
self,
|
| 84 |
+
topic: str,
|
| 85 |
+
cards_in_batch: int,
|
| 86 |
+
batch_num: int,
|
| 87 |
+
context: Optional[Dict[str, Any]],
|
| 88 |
+
previous_topics: List[str],
|
| 89 |
+
) -> str:
|
| 90 |
+
"""Build user input prompt for a batch of cards."""
|
| 91 |
+
user_input = f"Generate {cards_in_batch} flashcards for the topic: {topic}"
|
| 92 |
+
|
| 93 |
+
if context and context.get("generate_cloze"):
|
| 94 |
+
user_input += (
|
| 95 |
+
"\n\nIMPORTANT: Generate a mix of card types including cloze cards. "
|
| 96 |
+
"For code examples, syntax, and fill-in-the-blank concepts, use cloze cards "
|
| 97 |
+
"(card_type='cloze'). Aim for roughly 50% cloze cards when dealing with technical/programming content."
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
if context:
|
| 101 |
+
user_input += f"\n\nAdditional context: {context}"
|
| 102 |
+
|
| 103 |
+
if previous_topics:
|
| 104 |
+
topics_summary = ", ".join(previous_topics[-20:])
|
| 105 |
+
user_input += f"\n\nAvoid creating cards about these already covered topics: {topics_summary}"
|
| 106 |
+
|
| 107 |
+
if batch_num > 1:
|
| 108 |
+
user_input += f"\n\nThis is batch {batch_num} of cards. Ensure these cards cover different aspects of the topic."
|
| 109 |
+
|
| 110 |
+
return user_input
|
| 111 |
+
|
| 112 |
+
def _extract_topics_for_dedup(self, batch_cards: List[Card]) -> List[str]:
|
| 113 |
+
"""Extract key terms from card questions for deduplication."""
|
| 114 |
+
topics = []
|
| 115 |
+
for card in batch_cards:
|
| 116 |
+
if hasattr(card, "front") and card.front and card.front.question:
|
| 117 |
+
question_words = card.front.question.lower().split()
|
| 118 |
+
key_terms = [word for word in question_words if len(word) > 3][:3]
|
| 119 |
+
if key_terms:
|
| 120 |
+
topics.append(" ".join(key_terms))
|
| 121 |
+
return topics
|
| 122 |
+
|
| 123 |
+
def _accumulate_usage(
|
| 124 |
+
self, total_usage: Dict[str, int], batch_usage: Optional[Dict[str, Any]]
|
| 125 |
+
) -> None:
|
| 126 |
+
"""Accumulate batch usage into total usage."""
|
| 127 |
+
if batch_usage:
|
| 128 |
+
for key in total_usage:
|
| 129 |
+
total_usage[key] += batch_usage.get(key, 0)
|
| 130 |
+
|
| 131 |
async def generate_cards(
|
| 132 |
self, topic: str, num_cards: int = 5, context: Optional[Dict[str, Any]] = None
|
| 133 |
) -> List[Card]:
|
| 134 |
+
"""Generate flashcards for a given topic with automatic batching."""
|
| 135 |
+
batch_size = 10
|
| 136 |
+
all_cards: List[Card] = []
|
| 137 |
+
total_usage: Dict[str, int] = {
|
| 138 |
+
"total_tokens": 0,
|
| 139 |
+
"input_tokens": 0,
|
| 140 |
+
"output_tokens": 0,
|
| 141 |
+
}
|
| 142 |
+
previous_topics: List[str] = []
|
| 143 |
|
| 144 |
+
cards_remaining = num_cards
|
| 145 |
+
batch_num = 1
|
| 146 |
+
num_batches = ((num_cards - 1) // batch_size) + 1
|
| 147 |
|
| 148 |
+
logger.info(
|
| 149 |
+
f"Generating {num_cards} cards for '{topic}' using {num_batches} batches"
|
| 150 |
+
)
|
| 151 |
|
| 152 |
+
try:
|
| 153 |
while cards_remaining > 0:
|
| 154 |
+
cards_in_batch = min(batch_size, cards_remaining)
|
| 155 |
+
logger.info(f"Generating batch {batch_num}: {cards_in_batch} cards")
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
if not self.agent:
|
| 158 |
await self.initialize()
|
| 159 |
|
| 160 |
+
user_input = self._build_batch_prompt(
|
| 161 |
+
topic, cards_in_batch, batch_num, context, previous_topics
|
| 162 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
response, usage = await self.execute(user_input, context)
|
| 164 |
|
| 165 |
+
self._accumulate_usage(total_usage, usage)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
batch_cards = self._parse_cards_response(response, topic)
|
| 167 |
all_cards.extend(batch_cards)
|
| 168 |
|
| 169 |
+
previous_topics.extend(self._extract_topics_for_dedup(batch_cards))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
cards_remaining -= len(batch_cards)
|
|
|
|
| 171 |
|
| 172 |
logger.info(
|
| 173 |
+
f"Batch {batch_num} generated {len(batch_cards)} cards. {cards_remaining} remaining."
|
| 174 |
)
|
| 175 |
|
|
|
|
| 176 |
if len(batch_cards) == 0:
|
| 177 |
+
logger.warning(f"No cards generated in batch {batch_num}, stopping")
|
|
|
|
|
|
|
| 178 |
break
|
| 179 |
|
| 180 |
+
batch_num += 1
|
| 181 |
+
|
| 182 |
if total_usage.get("total_tokens", 0) > 0:
|
| 183 |
logger.info(
|
| 184 |
+
f"Total usage: {total_usage['total_tokens']} tokens "
|
| 185 |
+
f"(Input: {total_usage['input_tokens']}, Output: {total_usage['output_tokens']})"
|
| 186 |
)
|
| 187 |
|
| 188 |
logger.info(
|
| 189 |
+
f"Generated {len(all_cards)} cards across {batch_num} batches for '{topic}'"
|
| 190 |
)
|
| 191 |
return all_cards
|
| 192 |
|
ankigen_core/agents/token_tracker.py
CHANGED
|
@@ -34,6 +34,25 @@ class TokenTracker:
|
|
| 34 |
def count_tokens_for_messages(
|
| 35 |
self, messages: List[Dict[str, str]], model: str
|
| 36 |
) -> int:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
try:
|
| 38 |
encoding = tiktoken.encoding_for_model(model)
|
| 39 |
except KeyError:
|
|
@@ -61,11 +80,6 @@ class TokenTracker:
|
|
| 61 |
|
| 62 |
return len(encoding.encode(text))
|
| 63 |
|
| 64 |
-
def estimate_cost(
|
| 65 |
-
self, prompt_tokens: int, completion_tokens: int, model: str
|
| 66 |
-
) -> Optional[float]:
|
| 67 |
-
return None
|
| 68 |
-
|
| 69 |
def track_usage_from_response(
|
| 70 |
self, response_data, model: str
|
| 71 |
) -> Optional[TokenUsage]:
|
|
@@ -98,10 +112,7 @@ class TokenTracker:
|
|
| 98 |
) -> TokenUsage:
|
| 99 |
total_tokens = prompt_tokens + completion_tokens
|
| 100 |
|
| 101 |
-
|
| 102 |
-
final_cost = actual_cost
|
| 103 |
-
else:
|
| 104 |
-
final_cost = self.estimate_cost(prompt_tokens, completion_tokens, model)
|
| 105 |
|
| 106 |
usage = TokenUsage(
|
| 107 |
prompt_tokens=prompt_tokens,
|
|
|
|
| 34 |
def count_tokens_for_messages(
|
| 35 |
self, messages: List[Dict[str, str]], model: str
|
| 36 |
) -> int:
|
| 37 |
+
"""
|
| 38 |
+
Count total tokens for a list of chat messages using tiktoken.
|
| 39 |
+
|
| 40 |
+
Implements OpenAI's token counting algorithm for chat completions:
|
| 41 |
+
- Each message adds 3 tokens for role/content/structure overhead
|
| 42 |
+
- Message names add an additional token
|
| 43 |
+
- The entire message list adds 3 tokens for conversation wrapper
|
| 44 |
+
|
| 45 |
+
The encoding is selected based on the model:
|
| 46 |
+
- Attempts to use model-specific encoding via tiktoken
|
| 47 |
+
- Falls back to 'o200k_base' (GPT-4 Turbo encoding) for unknown models
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
messages: List of message dicts (each with 'role', 'content', optional 'name')
|
| 51 |
+
model: OpenAI model identifier (e.g., 'gpt-4.1', 'gpt-4o')
|
| 52 |
+
|
| 53 |
+
Returns:
|
| 54 |
+
Total tokens required to send these messages to the model
|
| 55 |
+
"""
|
| 56 |
try:
|
| 57 |
encoding = tiktoken.encoding_for_model(model)
|
| 58 |
except KeyError:
|
|
|
|
| 80 |
|
| 81 |
return len(encoding.encode(text))
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def track_usage_from_response(
|
| 84 |
self, response_data, model: str
|
| 85 |
) -> Optional[TokenUsage]:
|
|
|
|
| 112 |
) -> TokenUsage:
|
| 113 |
total_tokens = prompt_tokens + completion_tokens
|
| 114 |
|
| 115 |
+
final_cost = actual_cost # Cost estimation removed - rely on API-provided costs
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
usage = TokenUsage(
|
| 118 |
prompt_tokens=prompt_tokens,
|
ankigen_core/card_generator.py
CHANGED
|
@@ -70,10 +70,58 @@ GENERATION_MODES = [
|
|
| 70 |
# Legacy functions removed - all card generation now handled by agent system
|
| 71 |
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
api_key_input: str,
|
| 78 |
subject: str,
|
| 79 |
generation_mode: str,
|
|
@@ -89,109 +137,66 @@ async def orchestrate_card_generation( # MODIFIED: Added async
|
|
| 89 |
library_topic: str = None,
|
| 90 |
):
|
| 91 |
"""Orchestrates the card generation process based on UI inputs."""
|
| 92 |
-
|
| 93 |
logger.info(f"Starting card generation orchestration in {generation_mode} mode")
|
| 94 |
logger.debug(
|
| 95 |
-
f"Parameters: mode={generation_mode}, topics={topic_number},
|
|
|
|
| 96 |
)
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
subject=agent_subject,
|
| 129 |
-
num_cards=total_cards_needed,
|
| 130 |
-
difficulty="intermediate",
|
| 131 |
-
context=context,
|
| 132 |
-
library_name=library_name,
|
| 133 |
-
library_topic=library_topic,
|
| 134 |
-
generate_cloze=generate_cloze,
|
| 135 |
-
)
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
if hasattr(token_tracker, "get_session_summary"):
|
| 141 |
-
token_usage = token_tracker.get_session_summary()
|
| 142 |
-
elif hasattr(token_tracker, "get_session_usage"):
|
| 143 |
-
token_usage = token_tracker.get_session_usage()
|
| 144 |
-
else:
|
| 145 |
-
raise AttributeError("TokenTracker has no session summary method")
|
| 146 |
-
|
| 147 |
-
token_usage_html = f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
|
| 148 |
-
except Exception as e:
|
| 149 |
-
logger.error(f"Token usage collection failed: {e}")
|
| 150 |
-
token_usage_html = "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
|
| 151 |
-
|
| 152 |
-
# Convert agent cards to dataframe format
|
| 153 |
-
if agent_cards:
|
| 154 |
-
formatted_cards = format_cards_for_dataframe(
|
| 155 |
-
agent_cards,
|
| 156 |
-
topic_name=subject if subject else "General",
|
| 157 |
-
start_index=1,
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
output_df = pd.DataFrame(
|
| 161 |
-
formatted_cards, columns=get_dataframe_columns()
|
| 162 |
-
)
|
| 163 |
-
total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
|
| 164 |
-
|
| 165 |
-
logger.info(
|
| 166 |
-
f"Agent system generated {len(output_df)} cards successfully"
|
| 167 |
-
)
|
| 168 |
-
return output_df, total_cards_message, token_usage_html
|
| 169 |
-
else:
|
| 170 |
-
logger.error("Agent system returned no cards")
|
| 171 |
-
gr.Error("🤖 Agent system returned no cards")
|
| 172 |
-
return (
|
| 173 |
-
pd.DataFrame(columns=get_dataframe_columns()),
|
| 174 |
-
"Agent system returned no cards.",
|
| 175 |
-
"",
|
| 176 |
-
)
|
| 177 |
-
|
| 178 |
-
except Exception as e:
|
| 179 |
-
logger.error(f"Agent system failed: {e}")
|
| 180 |
-
gr.Error(f"🤖 Agent system error: {str(e)}")
|
| 181 |
-
return (
|
| 182 |
-
pd.DataFrame(columns=get_dataframe_columns()),
|
| 183 |
-
f"Agent system error: {str(e)}",
|
| 184 |
-
"",
|
| 185 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
|
| 196 |
|
| 197 |
# Legacy helper functions removed - all processing now handled by agent system
|
|
|
|
| 70 |
# Legacy functions removed - all card generation now handled by agent system
|
| 71 |
|
| 72 |
|
| 73 |
+
def _map_generation_mode_to_subject(generation_mode: str, subject: str) -> str:
|
| 74 |
+
"""Map UI generation mode to agent subject."""
|
| 75 |
+
if generation_mode == "subject":
|
| 76 |
+
return subject if subject else "general"
|
| 77 |
+
elif generation_mode == "path":
|
| 78 |
+
return "curriculum_design"
|
| 79 |
+
elif generation_mode == "text":
|
| 80 |
+
return "content_analysis"
|
| 81 |
+
return "general"
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _build_generation_context(generation_mode: str, source_text: str) -> Dict[str, Any]:
|
| 85 |
+
"""Build context dict for card generation."""
|
| 86 |
+
context: Dict[str, Any] = {}
|
| 87 |
+
if generation_mode == "text" and source_text:
|
| 88 |
+
context["source_text"] = source_text
|
| 89 |
+
return context
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _get_token_usage_html(token_tracker) -> str:
|
| 93 |
+
"""Extract token usage and format as HTML."""
|
| 94 |
+
try:
|
| 95 |
+
if hasattr(token_tracker, "get_session_summary"):
|
| 96 |
+
token_usage = token_tracker.get_session_summary()
|
| 97 |
+
elif hasattr(token_tracker, "get_session_usage"):
|
| 98 |
+
token_usage = token_tracker.get_session_usage()
|
| 99 |
+
else:
|
| 100 |
+
raise AttributeError("TokenTracker has no session summary method")
|
| 101 |
+
|
| 102 |
+
return f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
|
| 103 |
+
except Exception as e:
|
| 104 |
+
logger.error(f"Token usage collection failed: {e}")
|
| 105 |
+
return "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _format_cards_to_dataframe(
|
| 109 |
+
agent_cards: List[Card], subject: str
|
| 110 |
+
) -> tuple[pd.DataFrame, str]:
|
| 111 |
+
"""Format agent cards to DataFrame and generate message."""
|
| 112 |
+
formatted_cards = format_cards_for_dataframe(
|
| 113 |
+
agent_cards,
|
| 114 |
+
topic_name=subject if subject else "General",
|
| 115 |
+
start_index=1,
|
| 116 |
+
)
|
| 117 |
+
output_df = pd.DataFrame(formatted_cards, columns=get_dataframe_columns())
|
| 118 |
+
total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
|
| 119 |
+
return output_df, total_cards_message
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
async def orchestrate_card_generation(
|
| 123 |
+
client_manager: OpenAIClientManager,
|
| 124 |
+
cache: ResponseCache,
|
| 125 |
api_key_input: str,
|
| 126 |
subject: str,
|
| 127 |
generation_mode: str,
|
|
|
|
| 137 |
library_topic: str = None,
|
| 138 |
):
|
| 139 |
"""Orchestrates the card generation process based on UI inputs."""
|
|
|
|
| 140 |
logger.info(f"Starting card generation orchestration in {generation_mode} mode")
|
| 141 |
logger.debug(
|
| 142 |
+
f"Parameters: mode={generation_mode}, topics={topic_number}, "
|
| 143 |
+
f"cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
|
| 144 |
)
|
| 145 |
|
| 146 |
+
if not AGENTS_AVAILABLE:
|
| 147 |
+
logger.error("Agent system is required but not available")
|
| 148 |
+
gr.Error("Agent system is required but not available")
|
| 149 |
+
return pd.DataFrame(columns=get_dataframe_columns()), "Agent system error", ""
|
| 150 |
+
|
| 151 |
+
try:
|
| 152 |
+
from ankigen_core.agents.token_tracker import get_token_tracker
|
| 153 |
+
|
| 154 |
+
token_tracker = get_token_tracker()
|
| 155 |
+
orchestrator = AgentOrchestrator(client_manager)
|
| 156 |
+
|
| 157 |
+
logger.info(f"Using {model_name} for SubjectExpertAgent")
|
| 158 |
+
await orchestrator.initialize(api_key_input, {"subject_expert": model_name})
|
| 159 |
+
|
| 160 |
+
agent_subject = _map_generation_mode_to_subject(generation_mode, subject)
|
| 161 |
+
context = _build_generation_context(generation_mode, source_text)
|
| 162 |
+
total_cards_needed = topic_number * cards_per_topic
|
| 163 |
+
|
| 164 |
+
agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
|
| 165 |
+
topic=subject if subject else "Mixed Topics",
|
| 166 |
+
subject=agent_subject,
|
| 167 |
+
num_cards=total_cards_needed,
|
| 168 |
+
difficulty="intermediate",
|
| 169 |
+
context=context,
|
| 170 |
+
library_name=library_name,
|
| 171 |
+
library_topic=library_topic,
|
| 172 |
+
generate_cloze=generate_cloze,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
token_usage_html = _get_token_usage_html(token_tracker)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
+
if agent_cards:
|
| 178 |
+
output_df, total_cards_message = _format_cards_to_dataframe(
|
| 179 |
+
agent_cards, subject
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
)
|
| 181 |
+
logger.info(f"Agent system generated {len(output_df)} cards successfully")
|
| 182 |
+
return output_df, total_cards_message, token_usage_html
|
| 183 |
+
|
| 184 |
+
logger.error("Agent system returned no cards")
|
| 185 |
+
gr.Error("Agent system returned no cards")
|
| 186 |
+
return (
|
| 187 |
+
pd.DataFrame(columns=get_dataframe_columns()),
|
| 188 |
+
"Agent system returned no cards.",
|
| 189 |
+
"",
|
| 190 |
+
)
|
| 191 |
|
| 192 |
+
except Exception as e:
|
| 193 |
+
logger.error(f"Agent system failed: {e}")
|
| 194 |
+
gr.Error(f"Agent system error: {str(e)}")
|
| 195 |
+
return (
|
| 196 |
+
pd.DataFrame(columns=get_dataframe_columns()),
|
| 197 |
+
f"Agent system error: {str(e)}",
|
| 198 |
+
"",
|
| 199 |
+
)
|
| 200 |
|
| 201 |
|
| 202 |
# Legacy helper functions removed - all processing now handled by agent system
|
ankigen_core/context7.py
CHANGED
|
@@ -123,6 +123,129 @@ class Context7Client:
|
|
| 123 |
logger.error(f"Error calling Context7 tool {tool_name}: {e}")
|
| 124 |
return {"error": str(e), "success": False}
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
async def resolve_library_id(self, library_name: str) -> Optional[str]:
|
| 127 |
"""Resolve a library name to a Context7-compatible ID"""
|
| 128 |
logger.info(f"Resolving library ID for: {library_name}")
|
|
@@ -131,115 +254,19 @@ class Context7Client:
|
|
| 131 |
"resolve-library-id", {"libraryName": library_name}
|
| 132 |
)
|
| 133 |
|
| 134 |
-
if result and result.get("success") and result.get("text"):
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
if current_lib and current_lib.get("id"):
|
| 148 |
-
libraries.append(current_lib)
|
| 149 |
-
current_lib = {
|
| 150 |
-
"title": line.replace("- Title:", "").strip().lower()
|
| 151 |
-
}
|
| 152 |
-
|
| 153 |
-
# Parse library ID
|
| 154 |
-
elif line.startswith("- Context7-compatible library ID:"):
|
| 155 |
-
lib_id = line.replace(
|
| 156 |
-
"- Context7-compatible library ID:", ""
|
| 157 |
-
).strip()
|
| 158 |
-
if current_lib is not None:
|
| 159 |
-
current_lib["id"] = lib_id
|
| 160 |
-
|
| 161 |
-
# Parse code snippets count
|
| 162 |
-
elif line.startswith("- Code Snippets:"):
|
| 163 |
-
snippets_str = line.replace("- Code Snippets:", "").strip()
|
| 164 |
-
try:
|
| 165 |
-
snippets = int(snippets_str)
|
| 166 |
-
if current_lib is not None:
|
| 167 |
-
current_lib["snippets"] = snippets
|
| 168 |
-
except ValueError:
|
| 169 |
-
pass
|
| 170 |
-
|
| 171 |
-
# Parse trust score
|
| 172 |
-
elif line.startswith("- Trust Score:"):
|
| 173 |
-
score_str = line.replace("- Trust Score:", "").strip()
|
| 174 |
-
try:
|
| 175 |
-
trust = float(score_str)
|
| 176 |
-
if current_lib is not None:
|
| 177 |
-
current_lib["trust"] = trust
|
| 178 |
-
except ValueError:
|
| 179 |
-
pass
|
| 180 |
-
|
| 181 |
-
# Add the last library if exists
|
| 182 |
-
if current_lib and current_lib.get("id"):
|
| 183 |
-
libraries.append(current_lib)
|
| 184 |
-
|
| 185 |
-
# If we found libraries, pick the best match
|
| 186 |
-
if libraries:
|
| 187 |
-
search_term = library_name.lower()
|
| 188 |
-
|
| 189 |
-
# Score each library
|
| 190 |
-
best_lib = None
|
| 191 |
-
best_score = -1
|
| 192 |
-
|
| 193 |
-
for lib in libraries:
|
| 194 |
-
score = 0
|
| 195 |
-
lib_title = lib.get("title", "")
|
| 196 |
-
lib_id = lib["id"].lower()
|
| 197 |
-
|
| 198 |
-
# Exact title match gets highest priority
|
| 199 |
-
if lib_title == search_term:
|
| 200 |
-
score += 10000
|
| 201 |
-
# Check if it's exactly "pandas" in the path (not geopandas, etc)
|
| 202 |
-
elif lib_id == f"/{search_term}-dev/{search_term}":
|
| 203 |
-
score += 5000
|
| 204 |
-
elif f"/{search_term}/" in lib_id or lib_id.endswith(
|
| 205 |
-
f"/{search_term}"
|
| 206 |
-
):
|
| 207 |
-
score += 2000
|
| 208 |
-
# Partial title match (but penalize if it's a compound like "geopandas")
|
| 209 |
-
elif search_term in lib_title:
|
| 210 |
-
if lib_title == search_term:
|
| 211 |
-
score += 1000
|
| 212 |
-
elif lib_title.startswith(search_term):
|
| 213 |
-
score += 200
|
| 214 |
-
else:
|
| 215 |
-
score += 50
|
| 216 |
-
|
| 217 |
-
# Strong bonus for code snippets (indicates main library)
|
| 218 |
-
snippets = lib.get("snippets", 0)
|
| 219 |
-
score += snippets / 10 # Pandas has 7386 snippets
|
| 220 |
-
|
| 221 |
-
# Significant bonus for trust score (high trust = official/authoritative)
|
| 222 |
-
trust = lib.get("trust", 0)
|
| 223 |
-
score += trust * 100 # Trust 9.2 = 920 points, Trust 7 = 700 points
|
| 224 |
-
|
| 225 |
-
# Debug logging
|
| 226 |
-
if search_term in lib_title or search_term in lib_id:
|
| 227 |
-
logger.debug(
|
| 228 |
-
f"Scoring {lib['id']}: title='{lib_title}', snippets={snippets}, "
|
| 229 |
-
f"trust={trust}, score={score:.2f}"
|
| 230 |
-
)
|
| 231 |
-
|
| 232 |
-
if score > best_score:
|
| 233 |
-
best_score = score
|
| 234 |
-
best_lib = lib
|
| 235 |
-
|
| 236 |
-
if best_lib:
|
| 237 |
-
logger.info(
|
| 238 |
-
f"Resolved '{library_name}' to ID: {best_lib['id']} "
|
| 239 |
-
f"(title: {best_lib.get('title', 'unknown')}, snippets: {best_lib.get('snippets', 0)}, "
|
| 240 |
-
f"trust: {best_lib.get('trust', 0)}, score: {best_score:.2f})"
|
| 241 |
-
)
|
| 242 |
-
return best_lib["id"]
|
| 243 |
|
| 244 |
logger.warning(f"Could not resolve library ID for '{library_name}'")
|
| 245 |
return None
|
|
|
|
| 123 |
logger.error(f"Error calling Context7 tool {tool_name}: {e}")
|
| 124 |
return {"error": str(e), "success": False}
|
| 125 |
|
| 126 |
+
def _parse_library_response(self, text: str) -> list[Dict[str, Any]]:
|
| 127 |
+
"""Parse Context7 response text into list of library dicts.
|
| 128 |
+
|
| 129 |
+
Args:
|
| 130 |
+
text: Raw text response from Context7
|
| 131 |
+
|
| 132 |
+
Returns:
|
| 133 |
+
List of library dicts with keys: title, id, snippets, trust
|
| 134 |
+
"""
|
| 135 |
+
libraries = []
|
| 136 |
+
lines = text.split("\n")
|
| 137 |
+
current_lib: Dict[str, Any] = {}
|
| 138 |
+
|
| 139 |
+
for line in lines:
|
| 140 |
+
line = line.strip()
|
| 141 |
+
|
| 142 |
+
if line.startswith("- Title:"):
|
| 143 |
+
if current_lib and current_lib.get("id"):
|
| 144 |
+
libraries.append(current_lib)
|
| 145 |
+
current_lib = {"title": line.replace("- Title:", "").strip().lower()}
|
| 146 |
+
|
| 147 |
+
elif line.startswith("- Context7-compatible library ID:"):
|
| 148 |
+
lib_id = line.replace("- Context7-compatible library ID:", "").strip()
|
| 149 |
+
if current_lib is not None:
|
| 150 |
+
current_lib["id"] = lib_id
|
| 151 |
+
|
| 152 |
+
elif line.startswith("- Code Snippets:"):
|
| 153 |
+
snippets_str = line.replace("- Code Snippets:", "").strip()
|
| 154 |
+
try:
|
| 155 |
+
if current_lib is not None:
|
| 156 |
+
current_lib["snippets"] = int(snippets_str)
|
| 157 |
+
except ValueError:
|
| 158 |
+
pass
|
| 159 |
+
|
| 160 |
+
elif line.startswith("- Trust Score:"):
|
| 161 |
+
score_str = line.replace("- Trust Score:", "").strip()
|
| 162 |
+
try:
|
| 163 |
+
if current_lib is not None:
|
| 164 |
+
current_lib["trust"] = float(score_str)
|
| 165 |
+
except ValueError:
|
| 166 |
+
pass
|
| 167 |
+
|
| 168 |
+
if current_lib and current_lib.get("id"):
|
| 169 |
+
libraries.append(current_lib)
|
| 170 |
+
|
| 171 |
+
return libraries
|
| 172 |
+
|
| 173 |
+
def _score_library(self, lib: Dict[str, Any], search_term: str) -> float:
|
| 174 |
+
"""Score a library based on how well it matches the search term.
|
| 175 |
+
|
| 176 |
+
Args:
|
| 177 |
+
lib: Library dict with title, id, snippets, trust
|
| 178 |
+
search_term: Lowercase search term
|
| 179 |
+
|
| 180 |
+
Returns:
|
| 181 |
+
Score (higher is better match)
|
| 182 |
+
"""
|
| 183 |
+
score = 0.0
|
| 184 |
+
lib_title = lib.get("title", "")
|
| 185 |
+
lib_id = lib["id"].lower()
|
| 186 |
+
|
| 187 |
+
# Exact title match gets highest priority
|
| 188 |
+
if lib_title == search_term:
|
| 189 |
+
score += 10000
|
| 190 |
+
elif lib_id == f"/{search_term}-dev/{search_term}":
|
| 191 |
+
score += 5000
|
| 192 |
+
elif f"/{search_term}/" in lib_id or lib_id.endswith(f"/{search_term}"):
|
| 193 |
+
score += 2000
|
| 194 |
+
elif search_term in lib_title:
|
| 195 |
+
if lib_title == search_term:
|
| 196 |
+
score += 1000
|
| 197 |
+
elif lib_title.startswith(search_term):
|
| 198 |
+
score += 200
|
| 199 |
+
else:
|
| 200 |
+
score += 50
|
| 201 |
+
|
| 202 |
+
# Bonus for code snippets (indicates main library)
|
| 203 |
+
snippets = lib.get("snippets", 0)
|
| 204 |
+
score += snippets / 10
|
| 205 |
+
|
| 206 |
+
# Bonus for trust score (high trust = official/authoritative)
|
| 207 |
+
trust = lib.get("trust", 0)
|
| 208 |
+
score += trust * 100
|
| 209 |
+
|
| 210 |
+
return score
|
| 211 |
+
|
| 212 |
+
def _select_best_library(
|
| 213 |
+
self, libraries: list[Dict[str, Any]], search_term: str
|
| 214 |
+
) -> Optional[Dict[str, Any]]:
|
| 215 |
+
"""Select the best matching library from a list.
|
| 216 |
+
|
| 217 |
+
Args:
|
| 218 |
+
libraries: List of library dicts
|
| 219 |
+
search_term: Lowercase search term
|
| 220 |
+
|
| 221 |
+
Returns:
|
| 222 |
+
Best matching library dict, or None if no match
|
| 223 |
+
"""
|
| 224 |
+
best_lib = None
|
| 225 |
+
best_score = -1.0
|
| 226 |
+
|
| 227 |
+
for lib in libraries:
|
| 228 |
+
score = self._score_library(lib, search_term)
|
| 229 |
+
|
| 230 |
+
if search_term in lib.get("title", "") or search_term in lib["id"].lower():
|
| 231 |
+
logger.debug(
|
| 232 |
+
f"Scoring {lib['id']}: title='{lib.get('title', '')}', "
|
| 233 |
+
f"snippets={lib.get('snippets', 0)}, trust={lib.get('trust', 0)}, score={score:.2f}"
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
if score > best_score:
|
| 237 |
+
best_score = score
|
| 238 |
+
best_lib = lib
|
| 239 |
+
|
| 240 |
+
if best_lib:
|
| 241 |
+
logger.info(
|
| 242 |
+
f"Selected library: {best_lib['id']} (title: {best_lib.get('title', 'unknown')}, "
|
| 243 |
+
f"snippets: {best_lib.get('snippets', 0)}, trust: {best_lib.get('trust', 0)}, "
|
| 244 |
+
f"score: {best_score:.2f})"
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
return best_lib
|
| 248 |
+
|
| 249 |
async def resolve_library_id(self, library_name: str) -> Optional[str]:
|
| 250 |
"""Resolve a library name to a Context7-compatible ID"""
|
| 251 |
logger.info(f"Resolving library ID for: {library_name}")
|
|
|
|
| 254 |
"resolve-library-id", {"libraryName": library_name}
|
| 255 |
)
|
| 256 |
|
| 257 |
+
if not (result and result.get("success") and result.get("text")):
|
| 258 |
+
logger.warning(f"Could not resolve library ID for '{library_name}'")
|
| 259 |
+
return None
|
| 260 |
+
|
| 261 |
+
libraries = self._parse_library_response(result["text"])
|
| 262 |
+
if not libraries:
|
| 263 |
+
logger.warning(f"Could not resolve library ID for '{library_name}'")
|
| 264 |
+
return None
|
| 265 |
+
|
| 266 |
+
best_lib = self._select_best_library(libraries, library_name.lower())
|
| 267 |
+
if best_lib:
|
| 268 |
+
logger.info(f"Resolved '{library_name}' to ID: {best_lib['id']}")
|
| 269 |
+
return best_lib["id"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
logger.warning(f"Could not resolve library ID for '{library_name}'")
|
| 272 |
return None
|
ankigen_core/crawler.py
CHANGED
|
@@ -418,119 +418,173 @@ class WebCrawler:
|
|
| 418 |
|
| 419 |
return False, None
|
| 420 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
def crawl(
|
| 422 |
self, progress_callback: Optional[Callable[[int, int, str], None]] = None
|
| 423 |
) -> List[CrawledPage]:
|
| 424 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
urls_to_visit = self._initialize_crawl_queue()
|
| 426 |
crawled_pages: List[CrawledPage] = []
|
| 427 |
-
|
| 428 |
-
|
| 429 |
processed_count = 0
|
|
|
|
| 430 |
while urls_to_visit:
|
| 431 |
current_url, current_depth, current_parent_url = urls_to_visit.pop(0)
|
| 432 |
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
|
|
|
|
|
|
| 437 |
)
|
| 438 |
|
| 439 |
-
if progress_callback:
|
| 440 |
-
progress_callback(
|
| 441 |
-
processed_count,
|
| 442 |
-
current_total_for_progress,
|
| 443 |
-
current_url,
|
| 444 |
-
)
|
| 445 |
-
|
| 446 |
-
# Check if URL should be skipped using helper method
|
| 447 |
should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
|
| 448 |
if should_skip:
|
| 449 |
-
if
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
| 454 |
)
|
| 455 |
-
progress_callback(processed_count, dynamic_total, skip_reason)
|
| 456 |
continue
|
| 457 |
|
|
|
|
|
|
|
|
|
|
| 458 |
self.logger.info(
|
| 459 |
-
f"Crawling (Depth {current_depth}): {current_url} ({processed_count + 1}/{
|
| 460 |
)
|
| 461 |
|
| 462 |
-
if progress_callback:
|
| 463 |
-
progress_callback(
|
| 464 |
-
processed_count, current_total_for_progress, current_url
|
| 465 |
-
)
|
| 466 |
-
|
| 467 |
self.visited_urls.add(current_url)
|
| 468 |
-
|
| 469 |
self.rate_limiter.wait()
|
| 470 |
|
| 471 |
try:
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
html_content = response.text
|
| 475 |
-
soup = BeautifulSoup(html_content, "html.parser")
|
| 476 |
-
|
| 477 |
-
# Extract metadata using helper method
|
| 478 |
-
page_title, meta_description, meta_keywords = (
|
| 479 |
-
self._extract_page_metadata(soup, current_url)
|
| 480 |
-
)
|
| 481 |
-
|
| 482 |
-
text_content = self._extract_text(soup)
|
| 483 |
-
|
| 484 |
-
page_data = CrawledPage(
|
| 485 |
-
url=current_url,
|
| 486 |
-
html_content=html_content,
|
| 487 |
-
text_content=text_content,
|
| 488 |
-
title=page_title,
|
| 489 |
-
meta_description=meta_description,
|
| 490 |
-
meta_keywords=meta_keywords,
|
| 491 |
-
crawl_depth=current_depth,
|
| 492 |
-
parent_url=current_parent_url,
|
| 493 |
)
|
| 494 |
crawled_pages.append(page_data)
|
| 495 |
self.logger.info(f"Successfully processed and stored: {current_url}")
|
| 496 |
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
self.logger.debug(
|
| 500 |
-
f"Found {len(found_links)} links on {current_url}"
|
| 501 |
-
)
|
| 502 |
-
for link in found_links:
|
| 503 |
-
if link not in self.visited_urls:
|
| 504 |
-
urls_to_visit.append((link, current_depth + 1, current_url))
|
| 505 |
-
|
| 506 |
-
except requests.exceptions.HTTPError as e:
|
| 507 |
-
self.logger.error(
|
| 508 |
-
f"HTTPError for {current_url}: {e.response.status_code} - {e.response.reason}. Response: {e.response.text[:200]}...",
|
| 509 |
-
exc_info=False,
|
| 510 |
-
)
|
| 511 |
-
processed_count += 1
|
| 512 |
-
except requests.exceptions.ConnectionError as e:
|
| 513 |
-
self.logger.error(
|
| 514 |
-
f"ConnectionError for {current_url}: {e}", exc_info=False
|
| 515 |
-
)
|
| 516 |
-
processed_count += 1
|
| 517 |
-
except requests.exceptions.Timeout as e:
|
| 518 |
-
self.logger.error(f"Timeout for {current_url}: {e}", exc_info=False)
|
| 519 |
-
processed_count += 1
|
| 520 |
-
except requests.exceptions.RequestException as e:
|
| 521 |
-
self.logger.error(
|
| 522 |
-
f"RequestException for {current_url}: {e}", exc_info=True
|
| 523 |
)
|
| 524 |
-
|
| 525 |
except Exception as e:
|
| 526 |
-
self.
|
| 527 |
-
f"An unexpected error occurred while processing {current_url}: {e}",
|
| 528 |
-
exc_info=True,
|
| 529 |
-
)
|
| 530 |
processed_count += 1
|
|
|
|
|
|
|
|
|
|
| 531 |
|
| 532 |
self.logger.info(
|
| 533 |
-
f"Crawl completed. Total pages processed/attempted: {processed_count}.
|
|
|
|
| 534 |
)
|
| 535 |
if progress_callback:
|
| 536 |
progress_callback(processed_count, processed_count, "Crawling complete.")
|
|
|
|
| 418 |
|
| 419 |
return False, None
|
| 420 |
|
| 421 |
+
def _calculate_progress_total(
|
| 422 |
+
self, processed_count: int, urls_to_visit_len: int, initial_total: int
|
| 423 |
+
) -> int:
|
| 424 |
+
"""Calculate the total for progress reporting."""
|
| 425 |
+
if self.use_sitemap:
|
| 426 |
+
return initial_total
|
| 427 |
+
return processed_count + urls_to_visit_len + 1
|
| 428 |
+
|
| 429 |
+
def _update_crawl_progress(
|
| 430 |
+
self,
|
| 431 |
+
progress_callback: Optional[Callable[[int, int, str], None]],
|
| 432 |
+
processed_count: int,
|
| 433 |
+
urls_to_visit_len: int,
|
| 434 |
+
initial_total: int,
|
| 435 |
+
message: str,
|
| 436 |
+
) -> None:
|
| 437 |
+
"""Update progress callback if provided."""
|
| 438 |
+
if progress_callback:
|
| 439 |
+
total = self._calculate_progress_total(
|
| 440 |
+
processed_count, urls_to_visit_len, initial_total
|
| 441 |
+
)
|
| 442 |
+
progress_callback(processed_count, total, message)
|
| 443 |
+
|
| 444 |
+
def _fetch_and_parse_url(
    self, url: str, depth: int, parent_url: Optional[str]
) -> Tuple[CrawledPage, BeautifulSoup]:
    """Download a page and wrap it in a CrawledPage.

    Args:
        url: URL to fetch.
        depth: Crawl depth recorded on the page.
        parent_url: URL of the page that linked here, if any.

    Returns:
        ``(page, soup)``: the populated CrawledPage and the parsed document,
        so the caller can go on to extract links from it.

    Raises:
        requests.RequestException: If the HTTP request fails;
            ``raise_for_status`` turns HTTP error statuses into exceptions.
    """
    response = self.session.get(url, timeout=10)
    response.raise_for_status()

    raw_html = response.text
    soup = BeautifulSoup(raw_html, "html.parser")
    title, description, keywords = self._extract_page_metadata(soup, url)

    page = CrawledPage(
        url=url,
        html_content=raw_html,
        text_content=self._extract_text(soup),
        title=title,
        meta_description=description,
        meta_keywords=keywords,
        crawl_depth=depth,
        parent_url=parent_url,
    )
    return page, soup
|
| 480 |
+
|
| 481 |
+
def _enqueue_discovered_links(
    self,
    soup: BeautifulSoup,
    current_url: str,
    current_depth: int,
    urls_to_visit: List[Tuple[str, int, Optional[str]]],
) -> None:
    """Queue the not-yet-visited links found on a page.

    Nothing is queued once ``current_depth`` has reached ``self.max_depth``.
    Otherwise each extracted link that is not in ``self.visited_urls`` is
    appended to ``urls_to_visit`` (mutated in place) as
    ``(link, current_depth + 1, current_url)``.
    """
    if current_depth >= self.max_depth:
        return

    discovered = self._extract_links(soup, current_url)
    self.logger.debug(f"Found {len(discovered)} links on {current_url}")

    next_depth = current_depth + 1
    urls_to_visit.extend(
        (link, next_depth, current_url)
        for link in discovered
        if link not in self.visited_urls
    )
|
| 497 |
+
|
| 498 |
+
def _handle_crawl_error(self, url: str, error: Exception) -> None:
    """Log a crawl error with a level of detail suited to its type.

    HTTP, connection and timeout errors are logged without a traceback;
    other request errors and unexpected exceptions are logged with one.
    The specific ``requests`` exception types are tested before the generic
    ``RequestException`` base class so each gets its own message.

    Args:
        url: URL that was being processed when the error occurred.
        error: The exception that was raised.
    """
    if isinstance(error, requests.exceptions.HTTPError):
        response = error.response
        if response is None:
            # An HTTPError need not carry a response; fall back to the
            # exception text instead of failing inside the error handler.
            self.logger.error(f"HTTPError for {url}: {error}", exc_info=False)
        else:
            self.logger.error(
                f"HTTPError for {url}: {response.status_code} - {response.reason}. "
                f"Response: {response.text[:200]}...",
                exc_info=False,
            )
    elif isinstance(error, requests.exceptions.ConnectionError):
        self.logger.error(f"ConnectionError for {url}: {error}", exc_info=False)
    elif isinstance(error, requests.exceptions.Timeout):
        self.logger.error(f"Timeout for {url}: {error}", exc_info=False)
    elif isinstance(error, requests.exceptions.RequestException):
        self.logger.error(f"RequestException for {url}: {error}", exc_info=True)
    else:
        self.logger.error(
            f"An unexpected error occurred while processing {url}: {error}",
            exc_info=True,
        )
|
| 517 |
+
|
| 518 |
def crawl(
|
| 519 |
self, progress_callback: Optional[Callable[[int, int, str], None]] = None
|
| 520 |
) -> List[CrawledPage]:
|
| 521 |
+
"""Crawl website starting from the configured URL.
|
| 522 |
+
|
| 523 |
+
Args:
|
| 524 |
+
progress_callback: Optional callback for progress updates (processed, total, message)
|
| 525 |
+
|
| 526 |
+
Returns:
|
| 527 |
+
List of CrawledPage objects for successfully crawled pages
|
| 528 |
+
"""
|
| 529 |
urls_to_visit = self._initialize_crawl_queue()
|
| 530 |
crawled_pages: List[CrawledPage] = []
|
| 531 |
+
initial_total = len(urls_to_visit)
|
|
|
|
| 532 |
processed_count = 0
|
| 533 |
+
|
| 534 |
while urls_to_visit:
|
| 535 |
current_url, current_depth, current_parent_url = urls_to_visit.pop(0)
|
| 536 |
|
| 537 |
+
self._update_crawl_progress(
|
| 538 |
+
progress_callback,
|
| 539 |
+
processed_count,
|
| 540 |
+
len(urls_to_visit),
|
| 541 |
+
initial_total,
|
| 542 |
+
current_url,
|
| 543 |
)
|
| 544 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
|
| 546 |
if should_skip:
|
| 547 |
+
if skip_reason:
|
| 548 |
+
self._update_crawl_progress(
|
| 549 |
+
progress_callback,
|
| 550 |
+
processed_count,
|
| 551 |
+
len(urls_to_visit),
|
| 552 |
+
initial_total,
|
| 553 |
+
skip_reason,
|
| 554 |
)
|
|
|
|
| 555 |
continue
|
| 556 |
|
| 557 |
+
total = self._calculate_progress_total(
|
| 558 |
+
processed_count, len(urls_to_visit), initial_total
|
| 559 |
+
)
|
| 560 |
self.logger.info(
|
| 561 |
+
f"Crawling (Depth {current_depth}): {current_url} ({processed_count + 1}/{total})"
|
| 562 |
)
|
| 563 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
self.visited_urls.add(current_url)
|
|
|
|
| 565 |
self.rate_limiter.wait()
|
| 566 |
|
| 567 |
try:
|
| 568 |
+
page_data, soup = self._fetch_and_parse_url(
|
| 569 |
+
current_url, current_depth, current_parent_url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
)
|
| 571 |
crawled_pages.append(page_data)
|
| 572 |
self.logger.info(f"Successfully processed and stored: {current_url}")
|
| 573 |
|
| 574 |
+
self._enqueue_discovered_links(
|
| 575 |
+
soup, current_url, current_depth, urls_to_visit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
)
|
| 577 |
+
|
| 578 |
except Exception as e:
|
| 579 |
+
self._handle_crawl_error(current_url, e)
|
|
|
|
|
|
|
|
|
|
| 580 |
processed_count += 1
|
| 581 |
+
continue
|
| 582 |
+
|
| 583 |
+
processed_count += 1
|
| 584 |
|
| 585 |
self.logger.info(
|
| 586 |
+
f"Crawl completed. Total pages processed/attempted: {processed_count}. "
|
| 587 |
+
f"Successfully crawled pages: {len(crawled_pages)}"
|
| 588 |
)
|
| 589 |
if progress_callback:
|
| 590 |
progress_callback(processed_count, processed_count, "Crawling complete.")
|
ankigen_core/ui_logic.py
CHANGED
|
@@ -3,7 +3,9 @@
|
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd # Needed for use_selected_subjects type hinting
|
| 5 |
from typing import (
|
|
|
|
| 6 |
List,
|
|
|
|
| 7 |
Tuple,
|
| 8 |
)
|
| 9 |
from urllib.parse import urlparse
|
|
@@ -12,7 +14,7 @@ from urllib.parse import urlparse
|
|
| 12 |
import re # For URL validation and filename sanitization
|
| 13 |
import asyncio
|
| 14 |
|
| 15 |
-
from ankigen_core.crawler import WebCrawler
|
| 16 |
from ankigen_core.llm_interface import (
|
| 17 |
OpenAIClientManager,
|
| 18 |
)
|
|
@@ -436,6 +438,132 @@ def _basic_sanitize_filename(name: str) -> str:
|
|
| 436 |
return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
|
| 437 |
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
async def crawl_and_generate(
|
| 440 |
url: str,
|
| 441 |
max_depth: int,
|
|
@@ -453,145 +581,46 @@ async def crawl_and_generate(
|
|
| 453 |
status_textbox: gr.Textbox,
|
| 454 |
) -> Tuple[str, List[dict], List[Card]]:
|
| 455 |
"""Crawls a website, generates Anki cards, and prepares them for export/display."""
|
| 456 |
-
# Initialize crawler_ui_logger if it's meant to be used here, e.g., at the start of the function
|
| 457 |
-
# For now, assuming it's available in the scope (e.g., global or passed in if it were a class)
|
| 458 |
-
# If it's a module-level logger, it should be fine.
|
| 459 |
-
|
| 460 |
-
# Ensure the status_textbox is updated via gr.Info or similar if needed
|
| 461 |
-
# as it's a parameter but not directly used for output updates in the provided snippet.
|
| 462 |
-
# It might be used by side-effect if gr.Info/gr.Warning updates it globally, or if it's part of `progress`.
|
| 463 |
-
|
| 464 |
-
# The `status_textbox` parameter is not directly used to set a value in the return,
|
| 465 |
-
# but `gr.Info` might update a default status area, or it's for other UI purposes.
|
| 466 |
-
|
| 467 |
crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
|
| 468 |
-
|
| 469 |
-
|
| 470 |
return "Invalid URL", [], []
|
| 471 |
|
| 472 |
try:
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()]
|
| 481 |
-
|
| 482 |
-
# WebCrawler instantiation updated to remove parameters causing issues.
|
| 483 |
-
# The WebCrawler will use its defaults or other configured ways for these.
|
| 484 |
-
# The 'requests_per_second' from UI maps to 'delay_between_requests' internally if crawler supports it,
|
| 485 |
-
# but since 'delay_between_requests' was also flagged, we remove it.
|
| 486 |
-
# The WebCrawler class itself needs to be checked for its actual constructor parameters.
|
| 487 |
-
crawler = WebCrawler(
|
| 488 |
-
start_url=url,
|
| 489 |
-
max_depth=max_depth, # Assuming max_depth is still a valid param
|
| 490 |
-
# allowed_domains=[domain], # Removed based on linter error
|
| 491 |
-
# delay_between_requests=1.0 / crawler_requests_per_second # Removed
|
| 492 |
-
# if crawler_requests_per_second > 0
|
| 493 |
-
# else 0.1,
|
| 494 |
-
# max_pages=500, # Removed
|
| 495 |
-
include_patterns=include_list, # Assuming this is valid
|
| 496 |
-
exclude_patterns=exclude_list, # Assuming this is valid
|
| 497 |
-
use_sitemap=use_sitemap, # Assuming this is valid
|
| 498 |
-
sitemap_url=sitemap_url_str
|
| 499 |
-
if use_sitemap and sitemap_url_str and sitemap_url_str.strip()
|
| 500 |
-
else None,
|
| 501 |
-
)
|
| 502 |
-
|
| 503 |
-
total_urls_for_progress = 0
|
| 504 |
-
|
| 505 |
-
def crawler_progress_callback(
|
| 506 |
-
processed_count: int, total_urls: int, current_url_processing: str
|
| 507 |
-
):
|
| 508 |
-
nonlocal total_urls_for_progress
|
| 509 |
-
total_urls_for_progress = total_urls
|
| 510 |
-
if total_urls_for_progress > 0:
|
| 511 |
-
progress(
|
| 512 |
-
0.1 + (processed_count / total_urls_for_progress) * 0.4,
|
| 513 |
-
desc=f"Crawling: {processed_count}/{total_urls_for_progress} URLs. Current: {current_url_processing}",
|
| 514 |
-
)
|
| 515 |
-
else:
|
| 516 |
-
progress(
|
| 517 |
-
0.1 + processed_count * 0.01,
|
| 518 |
-
desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url_processing}",
|
| 519 |
-
)
|
| 520 |
-
|
| 521 |
-
crawler_ui_logger.info(f"Starting crawl for {url}...")
|
| 522 |
-
progress(0.15, desc=f"Starting crawl for {url}...")
|
| 523 |
-
crawled_pages = await asyncio.to_thread(
|
| 524 |
-
crawler.crawl, progress_callback=crawler_progress_callback
|
| 525 |
)
|
| 526 |
-
crawler_ui_logger.info(f"Crawling finished. Found {len(crawled_pages)} pages.")
|
| 527 |
-
progress(0.5, desc=f"Crawling finished. Found {len(crawled_pages)} pages.")
|
| 528 |
|
|
|
|
| 529 |
if not crawled_pages:
|
| 530 |
progress(1.0, desc="No pages were crawled. Check URL and patterns.")
|
| 531 |
-
# Return structure: (status_message, df_data, raw_cards_data)
|
| 532 |
return (
|
| 533 |
"No pages were crawled. Check URL and patterns.",
|
| 534 |
pd.DataFrame().to_dict(orient="records"),
|
| 535 |
[],
|
| 536 |
)
|
| 537 |
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
await orchestrator.initialize("dummy-key") # Key already in client_manager
|
| 544 |
-
|
| 545 |
-
# Combine all crawled content into a single context
|
| 546 |
-
combined_content = "\n\n--- PAGE BREAK ---\n\n".join(
|
| 547 |
-
[
|
| 548 |
-
f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."
|
| 549 |
-
for page in crawled_pages[
|
| 550 |
-
:10
|
| 551 |
-
] # Limit to first 10 pages to avoid token limits
|
| 552 |
-
]
|
| 553 |
-
)
|
| 554 |
-
|
| 555 |
-
context = {
|
| 556 |
-
"source_text": combined_content,
|
| 557 |
-
"crawl_source": url,
|
| 558 |
-
"pages_crawled": len(crawled_pages),
|
| 559 |
-
}
|
| 560 |
-
|
| 561 |
-
progress(0.6, desc="🤖 Processing with agent system...")
|
| 562 |
-
|
| 563 |
-
# Generate cards with agents
|
| 564 |
-
agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
|
| 565 |
-
topic=f"Content from {url}",
|
| 566 |
-
subject="web_content",
|
| 567 |
-
num_cards=min(len(crawled_pages) * 3, 50), # 3 cards per page, max 50
|
| 568 |
-
difficulty="intermediate",
|
| 569 |
-
enable_quality_pipeline=True,
|
| 570 |
-
context=context,
|
| 571 |
)
|
| 572 |
|
| 573 |
if agent_cards:
|
| 574 |
-
progress(0.9, desc=f"🤖 Agent system generated {len(agent_cards)} cards")
|
| 575 |
-
|
| 576 |
cards_for_dataframe_export = generate_cards_from_crawled_content(
|
| 577 |
agent_cards
|
| 578 |
)
|
| 579 |
-
|
| 580 |
-
final_message = f"🤖 Agent system processed content from {len(crawled_pages)} pages. Generated {len(agent_cards)} high-quality cards."
|
| 581 |
progress(1.0, desc=final_message)
|
| 582 |
-
|
| 583 |
-
return (
|
| 584 |
-
final_message,
|
| 585 |
-
cards_for_dataframe_export,
|
| 586 |
-
agent_cards,
|
| 587 |
-
)
|
| 588 |
else:
|
| 589 |
-
progress(1.0, desc=
|
| 590 |
-
return (
|
| 591 |
-
"Agent system returned no cards",
|
| 592 |
-
pd.DataFrame().to_dict(orient="records"),
|
| 593 |
-
[],
|
| 594 |
-
)
|
| 595 |
|
| 596 |
except ConnectionError as e:
|
| 597 |
crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
|
|
@@ -618,14 +647,6 @@ async def crawl_and_generate(
|
|
| 618 |
[],
|
| 619 |
)
|
| 620 |
|
| 621 |
-
final_message = f"Content crawled and processed. {len(cards_for_dataframe_export) if cards_for_dataframe_export else 0} potential cards prepared. Load them into the main table for review and export."
|
| 622 |
-
progress(1.0, desc=final_message)
|
| 623 |
-
return (
|
| 624 |
-
final_message,
|
| 625 |
-
cards_for_dataframe_export,
|
| 626 |
-
agent_cards,
|
| 627 |
-
) # agent_cards is List[Card]
|
| 628 |
-
|
| 629 |
|
| 630 |
# --- Card Preview and Editing Utilities (Task 13.3) ---
|
| 631 |
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd # Needed for use_selected_subjects type hinting
|
| 5 |
from typing import (
|
| 6 |
+
Callable,
|
| 7 |
List,
|
| 8 |
+
Optional,
|
| 9 |
Tuple,
|
| 10 |
)
|
| 11 |
from urllib.parse import urlparse
|
|
|
|
| 14 |
import re # For URL validation and filename sanitization
|
| 15 |
import asyncio
|
| 16 |
|
| 17 |
+
from ankigen_core.crawler import CrawledPage, WebCrawler
|
| 18 |
from ankigen_core.llm_interface import (
|
| 19 |
OpenAIClientManager,
|
| 20 |
)
|
|
|
|
| 438 |
return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
|
| 439 |
|
| 440 |
|
| 441 |
+
def _validate_crawl_url(url: str) -> bool:
|
| 442 |
+
"""Validate URL for crawling."""
|
| 443 |
+
if not url or not url.startswith(("http://", "https://")):
|
| 444 |
+
gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
|
| 445 |
+
return False
|
| 446 |
+
try:
|
| 447 |
+
urlparse(url)
|
| 448 |
+
return True
|
| 449 |
+
except Exception:
|
| 450 |
+
return False
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def _create_web_crawler(
    url: str,
    max_depth: int,
    include_patterns: str,
    exclude_patterns: str,
    use_sitemap: bool,
    sitemap_url_str: str,
) -> WebCrawler:
    """Build a ``WebCrawler`` from the raw crawl settings.

    Args:
        url: Start URL handed to the crawler.
        max_depth: Maximum crawl depth handed to the crawler.
        include_patterns: Comma-separated include patterns.
        exclude_patterns: Comma-separated exclude patterns.
        use_sitemap: Whether the crawler should use a sitemap.
        sitemap_url_str: Sitemap URL; only forwarded when ``use_sitemap`` is
            true and the value is not blank.

    Returns:
        The configured ``WebCrawler`` instance.
    """

    def split_patterns(raw: str) -> List[str]:
        # Split on commas, trim each piece and drop the empty ones.
        trimmed = (piece.strip() for piece in raw.split(","))
        return [piece for piece in trimmed if piece]

    includes = split_patterns(include_patterns)
    excludes = split_patterns(exclude_patterns)

    # The sitemap URL is passed through as entered; a blank value or a
    # disabled sitemap option results in None.
    if use_sitemap and sitemap_url_str.strip():
        sitemap_url = sitemap_url_str
    else:
        sitemap_url = None

    return WebCrawler(
        start_url=url,
        max_depth=max_depth,
        include_patterns=includes,
        exclude_patterns=excludes,
        use_sitemap=use_sitemap,
        sitemap_url=sitemap_url,
    )
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
def _create_crawl_progress_callback(
|
| 478 |
+
progress: gr.Progress,
|
| 479 |
+
) -> Tuple[Callable[[int, int, str], None], List[int]]:
|
| 480 |
+
"""Create progress callback for crawler with mutable state container."""
|
| 481 |
+
total_urls_container = [0] # Mutable container for nonlocal-like behavior
|
| 482 |
+
|
| 483 |
+
def callback(processed_count: int, total_urls: int, current_url: str):
|
| 484 |
+
total_urls_container[0] = total_urls
|
| 485 |
+
if total_urls_container[0] > 0:
|
| 486 |
+
progress(
|
| 487 |
+
0.1 + (processed_count / total_urls_container[0]) * 0.4,
|
| 488 |
+
desc=f"Crawling: {processed_count}/{total_urls_container[0]} URLs. Current: {current_url}",
|
| 489 |
+
)
|
| 490 |
+
else:
|
| 491 |
+
progress(
|
| 492 |
+
0.1 + processed_count * 0.01,
|
| 493 |
+
desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url}",
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
return callback, total_urls_container
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
async def _perform_web_crawl(
    crawler: WebCrawler,
    progress: gr.Progress,
    url: str,
) -> Optional[List[CrawledPage]]:
    """Run ``crawler.crawl`` in a worker thread and report progress.

    Args:
        crawler: Crawler whose ``crawl`` method is executed.
        progress: Callable invoked as ``progress(fraction, desc=...)``.
        url: URL being crawled; used only in log and progress messages.

    Returns:
        The value returned by ``crawler.crawl``, or None when it is empty.
    """
    on_page_progress, _ = _create_crawl_progress_callback(progress)

    start_message = f"Starting crawl for {url}..."
    crawler_ui_logger.info(start_message)
    progress(0.15, desc=start_message)

    # crawl() is called through asyncio.to_thread so it runs off the event loop.
    pages = await asyncio.to_thread(crawler.crawl, progress_callback=on_page_progress)

    done_message = f"Crawling finished. Found {len(pages)} pages."
    crawler_ui_logger.info(done_message)
    progress(0.5, desc=done_message)

    if not pages:
        return None
    return pages
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
async def _process_crawled_with_agents(
    crawled_pages: List[CrawledPage],
    client_manager: OpenAIClientManager,
    url: str,
    progress: gr.Progress,
) -> Tuple[List[Card], str]:
    """Generate cards from crawled pages through the agent orchestrator.

    Args:
        crawled_pages: Pages returned by the crawl.
        client_manager: Client manager handed to ``AgentOrchestrator``.
        url: Crawl source URL, used in the topic and the agent context.
        progress: Callable invoked as ``progress(fraction, desc=...)``.

    Returns:
        A ``(cards, message)`` tuple; ``cards`` is an empty list when the
        orchestrator returned nothing.
    """
    crawler_ui_logger.info("Using agent system for web crawling card generation")

    orchestrator = AgentOrchestrator(client_manager)
    # API key is already configured in client_manager; the empty string is
    # only a placeholder argument.
    await orchestrator.initialize("")

    page_count = len(crawled_pages)

    def render_page(page: CrawledPage) -> str:
        # Each page contributes at most its first 2000 characters of text.
        return f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."

    # Only the first 10 pages are included in the source text.
    source_text = "\n\n--- PAGE BREAK ---\n\n".join(
        render_page(page) for page in crawled_pages[:10]
    )

    agent_context = {
        "source_text": source_text,
        "crawl_source": url,
        "pages_crawled": page_count,
    }

    progress(0.6, desc="Processing with agent system...")

    # Request three cards per crawled page, capped at 50.
    requested_cards = min(page_count * 3, 50)
    agent_cards, _ = await orchestrator.generate_cards_with_agents(
        topic=f"Content from {url}",
        subject="web_content",
        num_cards=requested_cards,
        difficulty="intermediate",
        enable_quality_pipeline=True,
        context=agent_context,
    )

    if not agent_cards:
        return [], "Agent system returned no cards"

    card_count = len(agent_cards)
    progress(0.9, desc=f"Agent system generated {card_count} cards")
    summary = (
        f"Agent system processed content from {page_count} pages. "
        f"Generated {card_count} high-quality cards."
    )
    return agent_cards, summary
|
| 565 |
+
|
| 566 |
+
|
| 567 |
async def crawl_and_generate(
|
| 568 |
url: str,
|
| 569 |
max_depth: int,
|
|
|
|
| 581 |
status_textbox: gr.Textbox,
|
| 582 |
) -> Tuple[str, List[dict], List[Card]]:
|
| 583 |
"""Crawls a website, generates Anki cards, and prepares them for export/display."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
|
| 585 |
+
|
| 586 |
+
if not _validate_crawl_url(url):
|
| 587 |
return "Invalid URL", [], []
|
| 588 |
|
| 589 |
try:
|
| 590 |
+
crawler = _create_web_crawler(
|
| 591 |
+
url,
|
| 592 |
+
max_depth,
|
| 593 |
+
include_patterns,
|
| 594 |
+
exclude_patterns,
|
| 595 |
+
use_sitemap,
|
| 596 |
+
sitemap_url_str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
)
|
|
|
|
|
|
|
| 598 |
|
| 599 |
+
crawled_pages = await _perform_web_crawl(crawler, progress, url)
|
| 600 |
if not crawled_pages:
|
| 601 |
progress(1.0, desc="No pages were crawled. Check URL and patterns.")
|
|
|
|
| 602 |
return (
|
| 603 |
"No pages were crawled. Check URL and patterns.",
|
| 604 |
pd.DataFrame().to_dict(orient="records"),
|
| 605 |
[],
|
| 606 |
)
|
| 607 |
|
| 608 |
+
agent_cards, final_message = await _process_crawled_with_agents(
|
| 609 |
+
crawled_pages,
|
| 610 |
+
client_manager,
|
| 611 |
+
url,
|
| 612 |
+
progress,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
)
|
| 614 |
|
| 615 |
if agent_cards:
|
|
|
|
|
|
|
| 616 |
cards_for_dataframe_export = generate_cards_from_crawled_content(
|
| 617 |
agent_cards
|
| 618 |
)
|
|
|
|
|
|
|
| 619 |
progress(1.0, desc=final_message)
|
| 620 |
+
return final_message, cards_for_dataframe_export, agent_cards
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
else:
|
| 622 |
+
progress(1.0, desc=final_message)
|
| 623 |
+
return final_message, pd.DataFrame().to_dict(orient="records"), []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
|
| 625 |
except ConnectionError as e:
|
| 626 |
crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
|
|
|
|
| 647 |
[],
|
| 648 |
)
|
| 649 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
# --- Card Preview and Editing Utilities (Task 13.3) ---
|
| 652 |
|