File size: 9,266 Bytes
feea636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
from neo4j import GraphDatabase
from typing import Dict, List
from urllib.parse import urlparse
from config.settings import settings

class Neo4jStorage:
    def __init__(self):
        self.driver = GraphDatabase.driver(
            settings.database.neo4j_uri,
            auth=(settings.database.neo4j_user, settings.database.neo4j_password)
        )
        self._create_constraints()
    
    def _create_constraints(self):
        """Create constraints and indexes for better performance"""
        with self.driver.session() as session:
            try:
                session.run("CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE")
                session.run("CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE")
                session.run("CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)")
            except Exception as e:
                pass  # Constraints might already exist
    
    def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
        """Store page relationships and structure in Neo4j"""
        with self.driver.session() as session:
            # Create main page node
            self._create_page_node(session, url, extracted_data)
            
            # Create domain relationships
            self._create_domain_relationships(session, url, extracted_data)
            
            # Create content relationships
            self._create_content_relationships(session, url, extracted_data)
            
            # Create link relationships
            self._create_link_relationships(session, url, extracted_data["links"])
            
            # Create DOM structure relationships
            self._create_dom_relationships(session, url, dom_structure)
    
    def _create_page_node(self, session, url: str, data: Dict):
        """Create or update page node with LLM-friendly properties"""
        query = """
        MERGE (p:Page {url: $url})
        SET p.title = $title,
            p.description = $description,
            p.domain = $domain,
            p.content_type = $content_type,
            p.complexity_score = $complexity_score,
            p.reading_time = $reading_time,
            p.word_count = $word_count,
            p.last_scraped = datetime()
        """
        
        session.run(query, {
            "url": url,
            "title": data["metadata"]["title"],
            "description": data["metadata"]["description"],
            "domain": data["metadata"]["domain"],
            "content_type": self._identify_content_type(data),
            "complexity_score": self._calculate_complexity_score(data),
            "reading_time": len(data["text_summary"].split()) // 250,
            "word_count": len(data["text_summary"].split())
        })
    
    def _create_domain_relationships(self, session, url: str, data: Dict):
        """Create domain nodes and relationships"""
        domain = data["metadata"]["domain"]
        
        # Create domain node
        session.run("""
        MERGE (d:Domain {name: $domain})
        SET d.last_updated = datetime()
        """, {"domain": domain})
        
        # Link page to domain
        session.run("""
        MATCH (p:Page {url: $url})
        MATCH (d:Domain {name: $domain})
        MERGE (p)-[:BELONGS_TO]->(d)
        """, {"url": url, "domain": domain})
    
    def _create_content_relationships(self, session, url: str, data: Dict):
        """Create content structure relationships for LLM understanding"""
        # Create topic nodes from headings
        for i, heading in enumerate(data["metadata"]["headings"]):
            session.run("""
            MATCH (p:Page {url: $url})
            MERGE (h:Heading {text: $text, level: $level, page_url: $url})
            SET h.position = $position
            MERGE (p)-[:HAS_HEADING]->(h)
            """, {
                "url": url,
                "text": heading["text"],
                "level": heading["level"],
                "position": i
            })
        
        # Create content block relationships
        for i, block in enumerate(data["content"][:10]):  # Limit for performance
            session.run("""
            MATCH (p:Page {url: $url})
            MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
            SET c.tag = $tag,
                c.length = $length
            MERGE (p)-[:HAS_CONTENT]->(c)
            """, {
                "url": url,
                "text": block["text"][:500],  # Truncate for storage
                "tag": block["tag"],
                "length": len(block["text"]),
                "position": i
            })
    
    def _create_link_relationships(self, session, url: str, links: List[Dict]):
        """Create link relationships for navigation understanding"""
        for link in links[:20]:  # Limit for performance
            target_url = link["url"]
            link_text = link["text"]
            is_internal = link["internal"]
            
            # Create target page node (minimal)
            session.run("""
            MERGE (target:Page {url: $target_url})
            SET target.discovered_via = $source_url
            """, {"target_url": target_url, "source_url": url})
            
            # Create relationship
            relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
            session.run(f"""
            MATCH (source:Page {{url: $source_url}})
            MATCH (target:Page {{url: $target_url}})
            MERGE (source)-[r:{relationship_type}]->(target)
            SET r.link_text = $link_text,
                r.is_internal = $is_internal
            """, {
                "source_url": url,
                "target_url": target_url,
                "link_text": link_text,
                "is_internal": is_internal
            })
    
    def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
        """Create DOM structure relationships for content hierarchy"""
        # Create semantic structure nodes
        semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
        for tag, count in semantic_elements.items():
            if count > 0:
                session.run("""
                MATCH (p:Page {url: $url})
                MERGE (s:SemanticElement {tag: $tag, page_url: $url})
                SET s.count = $count
                MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
                """, {"url": url, "tag": tag, "count": count})
    
    def get_page_relationships(self, url: str) -> Dict:
        """Get all relationships for a page for LLM context"""
        with self.driver.session() as session:
            result = session.run("""
            MATCH (p:Page {url: $url})
            OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
            OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
            OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
            RETURN p, collect(DISTINCT internal.url) as internal_links,
                   collect(DISTINCT external.url) as external_links,
                   collect(DISTINCT {text: h.text, level: h.level}) as headings
            """, {"url": url})
            
            record = result.single()
            if record:
                return {
                    "page": dict(record["p"]),
                    "internal_links": record["internal_links"],
                    "external_links": record["external_links"],
                    "headings": record["headings"]
                }
            return {}
    
    def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
        """Find related pages for LLM context and study suggestions"""
        with self.driver.session() as session:
            result = session.run("""
            MATCH (p:Page {url: $url})
            MATCH (p)-[:BELONGS_TO]->(d:Domain)
            MATCH (related:Page)-[:BELONGS_TO]->(d)
            WHERE related.url <> $url
            RETURN related.url as url, related.title as title, 
                   related.content_type as content_type,
                   related.complexity_score as complexity_score
            ORDER BY related.complexity_score DESC
            LIMIT $limit
            """, {"url": url, "limit": limit})
            
            return [dict(record) for record in result]
    
    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type for graph relationships"""
        title = data["metadata"]["title"].lower()
        if "tutorial" in title or "guide" in title:
            return "tutorial"
        elif "documentation" in title or "docs" in title:
            return "documentation"
        elif "blog" in title or "article" in title:
            return "article"
        return "general"
    
    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate complexity score for relationship weighting"""
        text_length = len(data["text_summary"])
        content_blocks = len(data["content"])
        return min(text_length / 1000 + content_blocks / 10, 10.0)
    
    def close(self):
        """Close database connection"""
        self.driver.close()