import datetime
import os
import re
from typing import Dict, List, Optional

from pymongo import MongoClient


class MongoStorage:
    """Persist scraped page data in a MongoDB collection.

    Connection settings are read from the ``mongo_db_uri`` and
    ``mongo_db_name`` environment variables. Documents are written to the
    ``scraped_pages`` collection, one document per URL.
    """

    # Reading speed used by ``_estimate_reading_time``.
    _WORDS_PER_MINUTE = 250

    # Maximum number of topics returned by ``_extract_key_topics``.
    _MAX_TOPICS = 10

    # Ordered (content_type, keywords) pairs; the first type with a keyword
    # occurring in the title or summary wins.
    _CONTENT_TYPE_KEYWORDS = (
        ("tutorial", ("tutorial", "guide", "how to")),
        ("article", ("news", "article", "report")),
        ("documentation", ("documentation", "docs", "reference")),
        ("blog_post", ("blog", "post", "opinion")),
    )

    def __init__(self):
        """Connect to MongoDB and make sure the collection indexes exist."""
        # NOTE(review): both variables are read with ``os.environ.get``, so a
        # missing variable yields ``None`` here rather than a KeyError --
        # confirm how the deployment guarantees they are set.
        self.client = MongoClient(os.environ.get("mongo_db_uri"))
        self.db = self.client[os.environ.get("mongo_db_name")]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance."""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        # The title is stored at the top level of each document (see
        # ``store_page_data``); the previous "content.metadata.title" path is
        # never written, so an index on it covered nothing.
        self.collection.create_index("title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Store complete page data optimized for LLM consumption.

        The document for ``url`` is replaced if it already exists and
        inserted otherwise.

        Args:
            url: Page URL; used as the unique key of the document.
            extracted_data: Extraction result. Must provide the keys
                ``metadata`` (with ``domain``, ``title``, ``description``,
                ``headings``), ``text_summary``, ``content``, ``structure``,
                ``links`` (items with an ``internal`` flag) and ``images``.
            dom_structure: DOM analysis. Must provide the keys ``tree``,
                ``statistics``, ``semantic_structure`` and ``content_blocks``.

        Returns:
            The id of the inserted document as a string when a new document
            was created, otherwise the number of matched documents as a
            string.

        Raises:
            KeyError: If a required key is missing from either input.
        """
        metadata = extracted_data["metadata"]
        links = extracted_data["links"]

        document = {
            "url": url,
            "domain": metadata["domain"],
            # Timezone-aware replacement for the deprecated, naive utcnow().
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
            "title": metadata["title"],
            "description": metadata["description"],
            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": metadata["headings"],
                "structure_info": extracted_data["structure"],
            },
            # Relationship data
            "relationships": {
                "internal_links": [link for link in links if link["internal"]],
                "external_links": [link for link in links if not link["internal"]],
                "images": extracted_data["images"],
            },
            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"],
            },
            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data),
            },
        }

        # Upsert document
        result = self.collection.replace_one({"url": url}, document, upsert=True)
        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve page data by URL, or ``None`` when no document matches."""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all pages from a specific domain."""
        return list(self.collection.find({"domain": domain}))

    def search_pages(
        self, query: str, limit: int = 10, *, use_regex: bool = False
    ) -> List[Dict]:
        """Search pages by content for LLM queries.

        Performs a case-insensitive match of ``query`` against the title,
        description and text summary of each stored page.

        Args:
            query: Text to look for.
            limit: Maximum number of documents to return.
            use_regex: When ``False`` (default) ``query`` is matched
                literally. When ``True`` it is passed through unchanged as a
                regular expression.

        Returns:
            The matching documents.
        """
        # Escape by default so free text such as "c++" or "(" is matched
        # literally instead of being rejected as an invalid pattern or
        # interpreted as one.
        pattern = query if use_regex else re.escape(query)
        condition = {"$regex": pattern, "$options": "i"}
        search_filter = {
            "$or": [
                {"title": condition},
                {"description": condition},
                {"content.text_summary": condition},
            ]
        }
        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in minutes (250 words per minute).

        The result is never below one minute.
        """
        word_count = len(text.split())
        return max(1, word_count // self._WORDS_PER_MINUTE)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate content complexity for LLM processing hints.

        The score is the sum of three capped components and therefore lies
        between 0.0 and 10.0, rounded to two decimals.
        """
        # Text length factor: one point per 1000 characters, capped at 5.
        length_score = min(len(data["text_summary"]) / 1000, 5.0)
        # Structure complexity: one point per 10 content blocks, capped at 3.
        structure_score = min(len(data["content"]) / 10, 3.0)
        # Link density: one point per 20 links, capped at 2.
        link_score = min(len(data["links"]) / 20, 2.0)
        return round(0.0 + length_score + structure_score + link_score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type for LLM processing strategy.

        Keywords are matched as case-insensitive substrings of the title and
        the text summary. Types are tried in a fixed priority order and
        ``"general"`` is returned when nothing matches.
        """
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        for content_type, keywords in self._CONTENT_TYPE_KEYWORDS:
            if any(word in title or word in text for word in keywords):
                return content_type
        return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract key topics for study organization.

        Topics are the lower-cased words longer than three characters taken
        from the title and then from each heading, de-duplicated in
        first-seen order and limited to ten entries.
        """
        words = data["metadata"]["title"].split()
        for heading in data["metadata"]["headings"]:
            words.extend(heading["text"].split())

        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # truncated result is deterministic (a set has arbitrary order).
        topics = dict.fromkeys(word.lower() for word in words if len(word) > 3)
        return list(topics)[: self._MAX_TOPICS]