# rag_engine.py

import os
import pickle
import numpy as np
import faiss
import requests
import trafilatura
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from flashrank import Ranker, RerankRequest
import logging
import time

# Setup basic logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class KnowledgeBase:
    """
    High-Performance Local RAG Engine.
    Fusion Logic: User's Robust Extraction + FAISS Vector Storage.
    """
    def __init__(self, index_path="faiss_index.bin", metadata_path="metadata.pkl"):
        self.index_path = index_path
        self.metadata_path = metadata_path
        self.metadata = []
        
        logger.info("πŸ“š Initializing Knowledge Base (RAG Engine)...")
        
        # 1. Embedding Model (384 dim, Fast)
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        
        # 2. Reranker (Lightweight)
        self.ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="./flashrank_cache")
        
        # 3. Load/Create FAISS Index
        if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
            try:
                self.index = faiss.read_index(self.index_path)
                with open(self.metadata_path, "rb") as f:
                    self.metadata = pickle.load(f)
                logger.info(f"βœ… Loaded RAG Index ({self.index.ntotal} chunks).")
            except Exception as e:
                logger.error(f"❌ Index load failed: {e}. Resetting.")
                self.create_new_index()
        else:
            self.create_new_index()

    def create_new_index(self):
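        # The 384-dim flat L2 index matches all-MiniLM-L6-v2's output size.
        # Since vectors are L2-normalized before add/search (see ingest_url
        # and search below), L2 ranking here is equivalent to cosine-similarity
        # ranking; faiss.IndexFlatIP over the same normalized vectors would
        # rank identically.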
        self.index = faiss.IndexFlatL2(384) 
        self.metadata = []
        logger.info("πŸ†• Created new empty RAG Index.")

    def fetch_page_content(self, url):
        """
        Robust Fetcher: Tries Trafilatura first, falls back to Requests.
        Logic borrowed from user's successful script.
        """
        # Method A: Trafilatura Fetch
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return downloaded
        
        # Method B: Requests Fallback (User-Agent Spoofing)
        try:
            logger.warning(f"⚠️ Trafilatura fetch failed for {url}, trying requests...")
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ToolboxesAI-Bot/1.0"}
            resp = requests.get(url, timeout=15, headers=headers)
            if resp.status_code == 200:
                return resp.text
        except Exception as e:
            logger.error(f"❌ Requests failed for {url}: {e}")
        
        return None

    def ingest_url(self, url: str):
        """
        Extracts text using Trafilatura (Primary) -> BeautifulSoup (Backup).
        Chunks by PARAGRAPH (double-newline split).
        """
        logger.info(f"πŸ•·οΈ Scraping: {url}")
        
        # 1. Fetch
        html_content = self.fetch_page_content(url)
        if not html_content:
            return f"Failed to fetch {url}"

        # 2. Extract Text
        # Strategy A: Trafilatura (Best for articles/docs)
        text = trafilatura.extract(html_content, include_comments=False, include_tables=True, no_fallback=False)
        
        # Strategy B: BeautifulSoup Greedy (If Trafilatura thinks page is empty/nav-only)
        if not text or len(text) < 100:
            logger.info("⚠️ Trafilatura extraction empty. Using Greedy BeautifulSoup.")
            soup = BeautifulSoup(html_content, 'html.parser')
            for element in soup(['script', 'style', 'noscript', 'svg']):
                element.decompose()
            text = soup.get_text(separator='\n\n', strip=True)

        if not text:
            return "No readable text found."

        logger.info(f"πŸ“„ Extracted {len(text)} chars.")

        # 3. Chunking Strategy (User's Logic: Paragraph Split)
        # We split by double newline to preserve paragraph structure.
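        # Each stored chunk is prefixed with its source URL (see below), e.g.
        #   "Source: https://toolboxesai.com | Content: <paragraph text>"
        # so retrieved chunks carry their provenance into the final answer.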
        raw_chunks = [c.strip() for c in text.split('\n\n') if len(c.strip()) > 50]
        
        # Additional processing: If a paragraph is HUGE (>1000 chars), split it further
        final_chunks = []
        for chunk in raw_chunks:
            if len(chunk) > 1000:
                # Simple split for massive blocks
                for i in range(0, len(chunk), 800):
                    sub_chunk = chunk[i:i+800]
                    final_chunks.append(f"Source: {url} | Content: {sub_chunk}")
            else:
                final_chunks.append(f"Source: {url} | Content: {chunk}")

        if not final_chunks:
            return "Text was too short to chunk."

        # 4. Vectorize & Store
        try:
            embeddings = self.embedder.encode(final_chunks)
            faiss.normalize_L2(embeddings)
            
            self.index.add(np.array(embeddings).astype('float32'))
            self.metadata.extend(final_chunks)
            
            # Save to disk
            faiss.write_index(self.index, self.index_path)
            with open(self.metadata_path, "wb") as f:
                pickle.dump(self.metadata, f)
            
            return f"βœ… Ingested {len(final_chunks)} chunks."
            
        except Exception as e:
            return f"Error vectorizing: {e}"

    def search(self, query: str, top_k: int = 10) -> str:
        """
        Retrieves docs using FAISS -> Reranks using FlashRank.
        """
        if self.index.ntotal == 0:
            return ""

        # 1. Coarse Search (FAISS)
        query_vec = self.embedder.encode([query])
        faiss.normalize_L2(query_vec)
        distances, indices = self.index.search(np.array(query_vec).astype('float32'), top_k)
        
        candidates = []
        for idx in indices[0]:
            if idx != -1 and idx < len(self.metadata):
                candidates.append({"id": int(idx), "text": self.metadata[idx]})
        
        if not candidates:
            return ""

        # 2. Rerank (FlashRank)
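        # FlashRank takes the candidate passages as dicts with "id" and "text"
        # keys and returns them sorted by relevance, each with a "score" added.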
        rerank_request = RerankRequest(query=query, passages=candidates)
        results = self.ranker.rerank(rerank_request)
        
        # Return the top 5 re-ranked chunks
        top_results = results[:5]
        return "\n\n".join([f"[Local RAG] {r['text']}" for r in top_results])

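# --- Usage sketch (illustrative, not part of the pipeline above) ---
# A minimal example of folding search() output into an LLM prompt. The
# function name, prompt wording, and `question` parameter are assumptions
# made for illustration; adapt them to whatever your LLM client expects.
def build_rag_prompt(kb: KnowledgeBase, question: str) -> str:
    context = kb.search(question)
    if not context:
        # No local knowledge matched; fall back to the bare question.
        return question
    return (
        "Answer using ONLY the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}"
    )
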
local_kb = KnowledgeBase()
# ==========================================
# 🛠️ INGESTION ZONE (RUN THIS TO BUILD DB)
# ==========================================
if __name__ == "__main__":
    kb = local_kb
    
    # Your Verified URLs
    urls = [
        "https://toolboxesai.com", 
        "https://toolboxesai.com/hub",
        "https://toolboxesai.com/app",
        "https://toolboxesai.com/app-guide",
        "https://toolboxesai.com/privacy",
        "https://toolboxesai.com/terms",
        "https://toolboxesai.com/contact",
        "https://toolboxesai.com/about",
        "https://compressorpro.toolboxesai.com",
        "https://compressorpro.toolboxesai.com/resizer",
        "https://compressorpro.toolboxesai.com/enhancer",
        "https://compressorpro.toolboxesai.com/color-grader",
        "https://compressorpro.toolboxesai.com/print",
        "https://compressorpro.toolboxesai.com/user-guide"
    ]
    
    print("\nπŸš€ Starting ROBUST Knowledge Ingestion...")
    print("="*50)
    
    for url in urls:
        result = kb.ingest_url(url)
        print(f"Result: {result}")
        time.sleep(1) # Polite delay

    print("="*50)
    
    # Test
    print("\nπŸ§ͺ Testing Retrieval...")
    test_query = "What is toolboxesai?"
    print(f"Query: {test_query}")
    answer = kb.search(test_query)
    print("-" * 20)
    print(answer)
    print("-" * 20)