# OneServerToRuleThemAll/storage/mongo_storage.py
# Imported from commit feea636 ("add files") by etukurudinesh.
import datetime
import os
import re
from typing import Dict, List, Optional

from pymongo import MongoClient
class MongoStorage:
    """MongoDB-backed store for scraped page data, structured for LLM consumption.

    Connection settings come from the ``mongo_db_uri`` and ``mongo_db_name``
    environment variables; documents live in the ``scraped_pages`` collection,
    keyed uniquely by URL.
    """

    def __init__(self):
        # NOTE(review): if mongo_db_uri is unset, MongoClient(None) falls back to
        # localhost; mongo_db_name must be set or the db lookup below fails — TODO
        # confirm both are guaranteed by the deployment environment.
        self.client = MongoClient(os.environ.get("mongo_db_uri"))
        self.db = self.client[os.environ.get("mongo_db_name")]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance (idempotent in MongoDB)."""
        # "url" is the upsert key in store_page_data, hence unique.
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        self.collection.create_index("content.metadata.title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Upsert the complete record for one scraped page.

        Args:
            url: Canonical page URL; used as the unique upsert key.
            extracted_data: Scraper output; expected keys include "metadata"
                (domain/title/description/headings), "text_summary", "content",
                "structure", "links" (each with an "internal" bool), "images".
            dom_structure: DOM analysis with "tree", "statistics",
                "semantic_structure", "content_blocks".

        Returns:
            The new document's id as a string on insert, or the matched count
            ("1") as a string when an existing document was replaced.

        Raises:
            KeyError: if extracted_data/dom_structure lack an expected key.
        """
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            # Timezone-aware UTC; datetime.utcnow() is deprecated (3.12+) and
            # produced a naive timestamp.
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],
            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },
            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },
            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },
            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }
        # Upsert keyed on the unique "url" index.
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )
        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve one page document by exact URL, or None if absent."""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all pages from a specific domain."""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Case-insensitive substring search over title/description/summary.

        The query is escaped with re.escape so regex metacharacters (e.g. a
        search for "c++" or "(") are matched literally instead of being
        interpreted as a pattern — the previous raw interpolation allowed
        malformed-regex errors and unbounded patterns from user input.
        """
        pattern = {"$regex": re.escape(query), "$options": "i"}
        search_filter = {
            "$or": [
                {"title": pattern},
                {"description": pattern},
                {"content.text_summary": pattern}
            ]
        }
        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in minutes (250 words per minute), minimum 1."""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate content complexity (0.0-10.0) for LLM processing hints."""
        score = 0.0
        # Text length factor (capped at 5.0)
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)
        # Structure complexity (capped at 3.0)
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)
        # Link density (capped at 2.0)
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)
        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Classify the page by keyword heuristics on title and summary text."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()
        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract up to 10 keyword topics from the title and headings.

        Words longer than 3 characters are collected (lowercased, deduplicated)
        and returned sorted — previously the set was sliced unsorted, so the
        10 returned topics were nondeterministic across runs.
        """
        topics = set()
        # From title
        title_words = data["metadata"]["title"].split()
        topics.update([word.lower() for word in title_words if len(word) > 3])
        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update([word.lower() for word in heading_words if len(word) > 3])
        # Sorted for deterministic output; cap at 10.
        return sorted(topics)[:10]