File size: 5,888 Bytes
feea636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import datetime
import os
import re
from typing import Dict, List, Optional

from pymongo import MongoClient

class MongoStorage:
    """MongoDB-backed store for scraped page data, shaped for LLM consumption.

    Documents live in the ``scraped_pages`` collection, keyed (uniquely) by URL.
    Connection settings come from the ``mongo_db_uri`` and ``mongo_db_name``
    environment variables.
    """

    def __init__(self):
        # NOTE(review): both env vars are read with .get(), so a missing value
        # only surfaces later as an opaque pymongo error — confirm deployment
        # guarantees they are set.
        self.client = MongoClient(os.environ.get("mongo_db_uri"))
        self.db = self.client[os.environ.get("mongo_db_name")]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance (idempotent)."""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        # Fixed: previously indexed "content.metadata.title", a path that
        # store_page_data() never writes. The title lives at the top level
        # (and is what search_pages() queries), so index that instead.
        self.collection.create_index("title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Upsert the complete record for one scraped page.

        Args:
            url: Canonical page URL; used as the unique upsert key.
            extracted_data: Parsed page content; must contain ``metadata``
                (``domain``, ``title``, ``description``, ``headings``),
                ``text_summary``, ``content``, ``structure``, ``links``
                (each link dict carrying an ``internal`` flag), and ``images``.
            dom_structure: DOM analysis with ``tree``, ``statistics``,
                ``semantic_structure``, and ``content_blocks``.

        Returns:
            The new document's id as a string on insert; on replacement of an
            existing document, the matched count ("1") — kept as-is because
            callers may rely on this (admittedly odd) contract.

        Raises:
            KeyError: If a required key is missing from either input dict.
        """
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            # Timezone-aware UTC; datetime.utcnow() is deprecated and naive.
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],

            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },

            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },

            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },

            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }

        # Upsert: replace the whole document so stale fields never linger.
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )

        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Return the stored document for *url*, or None if not scraped yet."""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Return all stored pages for *domain* (may be a large list)."""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Case-insensitive substring search over title/description/summary.

        The query is treated as a literal string: it is escaped before being
        embedded in the ``$regex``, so metacharacters in user input (e.g.
        ``(``, ``*``, ``?``) can no longer break or subvert the query.
        """
        escaped = re.escape(query)
        search_filter = {
            "$or": [
                {"title": {"$regex": escaped, "$options": "i"}},
                {"description": {"$regex": escaped, "$options": "i"}},
                {"content.text_summary": {"$regex": escaped, "$options": "i"}}
            ]
        }

        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in whole minutes at 250 words/minute (min 1)."""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Heuristic complexity score (0.0–10.0) used as an LLM processing hint.

        Capped contributions: text length (≤5.0), content-block count (≤3.0),
        and link count (≤2.0).
        """
        score = 0.0

        # Text length factor
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)

        # Structure complexity
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)

        # Link density
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)

        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Classify the page by keyword sniffing of title/summary.

        First matching category wins, in order: tutorial, article,
        documentation, blog_post; otherwise "general".
        """
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract up to 10 lowercase keywords (>3 chars) from title and headings.

        Returned sorted alphabetically so the result is deterministic —
        previously this sliced an unordered set, so the selection varied
        across runs due to string-hash randomization.
        """
        topics = set()

        # From title
        title_words = data["metadata"]["title"].split()
        topics.update(word.lower() for word in title_words if len(word) > 3)

        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update(word.lower() for word in heading_words if len(word) > 3)

        return sorted(topics)[:10]