# GitHub Copilot
# Agents: Deployed DocAtomizer and seeded Pydantic knowledge
# cffaba9
import re
import os
import aiohttp
from logos.agents.base_agent import BaseAgent
from logos.connectors import LocalLLMConnector
class DocAtomizer(BaseAgent):
    """
    Role: D-NODE (Documentation Ingest)
    Function: Scrapes documentation URLs and atomizes them into the Knowledge Base.
    """

    # Knowledge-base index file that ingestion results are appended to.
    KB_PATH = os.path.join("logos", "knowledge_base", "doc_index.md")
    # Local LLM used to analyze each URL.
    MODEL_NAME = "dolphin-x1-8b"
    # Maximum characters of the analysis kept in the returned preview.
    PREVIEW_LEN = 100

    @property
    def name(self) -> str:
        return "DocAtomizer"

    @property
    def description(self) -> str:
        return "Ingests documentation/article URLs (non-video), extracts concepts, and updates the Knowledge Base."

    @property
    def triggers(self) -> list:
        return ["http://", "https://", "docs", "documentation", "read"]

    async def process(self, task: dict) -> dict:
        """Extract non-video URLs from the task content, analyze each with the
        local LLM, and append the analysis to the knowledge-base index file.

        Args:
            task: Task dict; the 'content' key is scanned for URLs.

        Returns:
            {"status": "IGNORED", "reason": ...} when no usable URL is found,
            otherwise {"status": "COMPLETE", "results": [...]} where each
            result holds either an analysis preview or a per-URL error.
        """
        content = task.get('content', '')

        # 1. Extract URLs (exclude YouTube to avoid stepping on VideoAtomizer)
        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^\s]*', content)
        valid_urls = [u for u in urls if 'youtube.com' not in u and 'youtu.be' not in u]

        if not valid_urls:
            return {"status": "IGNORED", "reason": "No valid non-video URLs found."}

        results = []
        connector = LocalLLMConnector(model=self.MODEL_NAME)

        # Loop-invariant: the KB directory only needs creating once.
        os.makedirs(os.path.dirname(self.KB_PATH), exist_ok=True)

        for url in valid_urls:
            print(f"[{self.name}] Ingesting: {url}")
            try:
                # 2. Analyze with the local LLM.
                # NOTE(review): no page fetch actually happens here — the model
                # only sees the URL string itself. A real scraper/tool should
                # supply page content in the future (aiohttp is imported but
                # currently unused).
                analysis_prompt = f"Analyze the intent and content of this documentation URL: {url}. Extract 5 key technical concepts."
                response, _ = await connector.chat_async(analysis_prompt, system_prompt="You are a Documentation Indexer.")

                # 3. Append the analysis to the Knowledge Base index.
                entry = f"\n## [DOC] {url}\n**Analysis:**\n{response}\n---\n"
                with open(self.KB_PATH, "a", encoding="utf-8") as f:
                    f.write(entry)

                # Only mark truncation with an ellipsis when it actually occurs.
                preview = response[:self.PREVIEW_LEN]
                if len(response) > self.PREVIEW_LEN:
                    preview += "..."
                results.append({"url": url, "analysis": preview})
            except Exception as e:
                # Best-effort per URL: record the failure and continue with the rest.
                print(f"[{self.name}] Error processing {url}: {e}")
                results.append({"url": url, "error": str(e)})

        return {"status": "COMPLETE", "results": results}