OnlyTheTruth03 committed on
Commit
709c859
·
1 Parent(s): cb057ef

Initial RAG bot

Browse files
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Secrets
2
+ .env
3
+ index/*.pkl
4
+ index/*.index
5
+
6
+ # Virtual environment
7
+ venv/
8
+ __pycache__/
9
+
10
+ # OS files
11
+ .DS_Store
12
+ Thumbs.db
13
+
14
+ # Large raw PDFs (optional)
15
+ data/pdfs/
16
+
17
+ # Python build
18
+ *.pyc
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from rag import retrieve, ask_llm
4
+
5
# -----------------------------
# Streamlit Page Config
# -----------------------------
st.set_page_config(
    page_title="Only The Truth – Astrology Tutor",
    page_icon="🪐",
    layout="wide",
)

st.title("🪐 Only The Truth – Astrology Tutor")
st.caption("Vedic Astrology • PDF-Based Knowledge • RAG Powered")

# Directory the image file names stored on each retrieved chunk are
# resolved against.
IMAGE_DIR = "data/images"

# Substrings that mark a question or a chunk as diagram-related.
KEYWORDS = [
    "chart", "diagram", "lagna", "horoscope",
    "d60", "sashtyamsa", "divisional", "birth chart",
]


def _mentions_diagram(text):
    """Return True when *text* contains any KEYWORDS entry (case-insensitive)."""
    lowered = text.lower()
    return any(keyword in lowered for keyword in KEYWORDS)


# -----------------------------
# User Input
# -----------------------------
query = st.text_input(
    "Ask your astrology question:",
    placeholder="Example: Explain how twins are analyzed using D60 chart",
)

# -----------------------------
# Process Query
# -----------------------------
if query:
    with st.spinner("🔍 Searching ancient wisdom..."):
        contexts = retrieve(query)

    with st.spinner("🧠 Interpreting charts..."):
        answer = ask_llm(query, contexts)

    # -----------------------------
    # Answer Section
    # -----------------------------
    st.subheader("🪐 Answer")
    st.markdown(answer)

    # -----------------------------
    # Image Display Logic (STRICT)
    # -----------------------------
    st.subheader("📘 Reference Diagrams")

    # File names already rendered; several retrieved chunks can carry the
    # same image names, and each image should appear only once.
    shown_images = set()

    # Only show images if BOTH query & chunk indicate diagram relevance.
    # The query-side check does not depend on the chunk, so do it once.
    if _mentions_diagram(query):
        for c in contexts:
            if not _mentions_diagram(c["text"]):
                continue

            for img in c.get("images", []):
                if img in shown_images:
                    continue

                img_path = os.path.join(IMAGE_DIR, img)
                if os.path.exists(img_path):
                    st.image(
                        img_path,
                        caption=f"{c['source']} — page {c['page']}",
                        use_container_width=True,
                    )
                    shown_images.add(img)

    if not shown_images:
        st.info("ℹ No relevant diagrams found in the reference material.")

# -----------------------------
# Footer
# -----------------------------
st.markdown("---")
st.caption("Built with FAISS • SentenceTransformers • Groq • Streamlit")
image_gen.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+
4
+ HF_TOKEN = os.getenv("HF_TOKEN")
5
+
6
+ if not HF_TOKEN:
7
+ raise RuntimeError("❌ HF_TOKEN not set")
8
+
9
+ API_URL = "https://router.huggingface.co/hf-inference/models/stabilityai/stable-diffusion-xl-base-1.0"
10
+
11
+ HEADERS = {
12
+ "Authorization": f"Bearer {HF_TOKEN}",
13
+ "Content-Type": "application/json"
14
+ }
15
+
16
def generate_image(prompt: str) -> bytes:
    """POST *prompt* to ``API_URL`` and return the raw response body.

    The request is sent as JSON with the module-level ``HEADERS`` and a
    fixed set of generation parameters, using a 120 second timeout.

    Raises:
        RuntimeError: when the response status is anything other than 200;
            the message is the response text.
    """
    generation_settings = {
        "width": 1024,
        "height": 1024,
        "num_inference_steps": 30,
        "guidance_scale": 7.5,
    }

    reply = requests.post(
        API_URL,
        headers=HEADERS,
        json={"inputs": prompt, "parameters": generation_settings},
        timeout=120,
    )

    if reply.status_code == 200:
        return reply.content

    raise RuntimeError(reply.text)
index/documents.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b766710fe0a84473610afdf4676495aa27caff95e65f8dba4b60f70a70854d42
3
+ size 55508
index/faiss.index ADDED
Binary file (49.2 kB). View file
 
ingest.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import pdfplumber
4
+ from sentence_transformers import SentenceTransformer
5
+ import faiss
6
+
7
+ PDF_DIR = "data"
8
+ IMAGE_DIR = "data/images"
9
+ INDEX_DIR = "index"
10
+
11
+ os.makedirs(IMAGE_DIR, exist_ok=True)
12
+ os.makedirs(INDEX_DIR, exist_ok=True)
13
+
14
def chunk_text(text, size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings (words re-joined with single spaces).
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: if ``size`` is not positive or ``overlap`` is not in
            ``[0, size)``; such values would give a zero or negative step.
    """
    if size <= 0:
        raise ValueError("size must be a positive number of words")
    if overlap < 0 or overlap >= size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + size]))
        # Once a chunk reaches the last word, any further window would
        # contain only words already covered by this chunk's tail.
        if start + size >= len(words):
            break
    return chunks
20
+
21
# One dict per text chunk: {"text", "source", "page", "images"}.
documents = []

print("📥 Processing PDFs...")

# Sorted so the chunk order (and therefore the index row order) does not
# depend on the directory listing order.
for pdf_file in sorted(os.listdir(PDF_DIR)):
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_DIR, pdf_file)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):

            text = page.extract_text() or ""
            if not text.strip():
                # Images are attached to text chunks, so a page without
                # text contributes nothing.
                continue

            # 🔹 Extract images
            image_files = []
            for i, img in enumerate(page.images):
                img_name = f"{pdf_file}_p{page_num}_i{i}.png"
                try:
                    x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
                    cropped = page.crop((x0, top, x1, bottom))
                    img_obj = cropped.to_image(resolution=200)
                    img_obj.save(os.path.join(IMAGE_DIR, img_name))
                except Exception as exc:
                    # Best effort: one bad image must not abort ingestion,
                    # but report it instead of hiding it.
                    print(f"⚠ Skipping image {img_name}: {exc}")
                    continue

                image_files.append(img_name)

            # 🔹 Chunk text and attach images
            for chunk in chunk_text(text):
                documents.append({
                    "text": chunk,
                    "source": pdf_file,
                    "page": page_num,
                    # Copy so chunks do not share one mutable list.
                    "images": list(image_files),
                })

print(f"📄 Total chunks: {len(documents)}")

if not documents:
    # Without any chunk there are no embeddings to take a dimension from.
    raise SystemExit(f"❌ No extractable text found in PDFs under '{PDF_DIR}'")

# 🔹 Embeddings
print("🧠 Generating embeddings...")
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = [doc["text"] for doc in documents]
embeddings = model.encode(texts, show_progress_bar=True)

# 🔹 FAISS
print("📦 Building FAISS index...")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

faiss.write_index(index, f"{INDEX_DIR}/faiss.index")

# documents.pkl is read back with pickle.load by the retrieval code;
# NOTE(review): only load pickles produced by this script.
with open(f"{INDEX_DIR}/documents.pkl", "wb") as f:
    pickle.dump(documents, f)

print("✅ Ingestion complete (TEXT + IMAGES)")
ingest_images.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pdfplumber
3
+ from PIL import Image
4
+
5
+ PDF_DIR = "data"
6
+ IMAGE_DIR = "data/images"
7
+
8
+ os.makedirs(IMAGE_DIR, exist_ok=True)
9
+
10
def extract_images():
    """Crop every embedded image out of each PDF in ``PDF_DIR``.

    Each image region reported by pdfplumber is rendered at 300 dpi and
    saved to ``IMAGE_DIR`` as ``{pdf_file}_p{page}_i{index}.png``.  Page
    numbers start at 1 so the file names follow the same scheme as the
    ones written by ``ingest.py``.  An image that cannot be cropped or
    saved is reported and skipped rather than aborting the run.
    """
    for pdf_file in os.listdir(PDF_DIR):
        if not pdf_file.endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_DIR, pdf_file)

        with pdfplumber.open(pdf_path) as pdf:
            for page_no, page in enumerate(pdf.pages, start=1):
                for img_no, img in enumerate(page.images):
                    img_name = f"{pdf_file}_p{page_no}_i{img_no}.png"
                    bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
                    try:
                        cropped = page.crop(bbox).to_image(resolution=300)
                        cropped.save(os.path.join(IMAGE_DIR, img_name))
                    except Exception as exc:
                        # Best effort, mirroring ingest.py: skip the image
                        # but say so.
                        print(f"⚠ Skipping image {img_name}: {exc}")
25
+
26
+ extract_images()
kb_builder/auth/stage1.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from playwright.sync_api import sync_playwright
3
+
4
def login_to_wix(email):
    """Open the site in Chromium and submit the e-mail/password login form.

    The password is read from the ``WIX_PASSWORD`` environment variable.

    Args:
        email: Account e-mail typed into the login form.

    Returns:
        ``(browser, page)``; the browser is left open for the caller, who
        is responsible for closing it.

    Raises:
        RuntimeError: if ``WIX_PASSWORD`` is not set.
        ValueError: if ``email`` is empty.
    """
    password = os.getenv("WIX_PASSWORD")
    if not password:
        raise RuntimeError("WIX_PASSWORD environment variable not set")
    if not email:
        raise ValueError("email must be a non-empty string")

    # Started explicitly (no `with`) so the browser stays usable after
    # this function returns.
    p = sync_playwright().start()
    browser = None
    try:
        browser = p.chromium.launch(
            headless=False,
            slow_mo=50,
        )

        context = browser.new_context()
        page = context.new_page()

        print("🌐 Opening site...")
        page.goto("https://www.onlythetruth.in/learnastrology", wait_until="domcontentloaded")
        page.wait_for_timeout(4000)  # Allow hydration

        # ---------------- Stage 1: Click top-right Login ----------------
        print("🔑 Clicking top-right Login...")
        login_elem = page.locator('text="Log In"').first
        login_elem.wait_for(state="visible", timeout=10000)
        # Click through JavaScript instead of a Playwright pointer click.
        page.evaluate('(el) => el.click()', login_elem.element_handle())
        page.wait_for_timeout(3000)  # Wait for lightbox to appear
        print("✅ Lightbox should appear now.")

        # ---------------- Stage 2: Get iframe ----------------
        print("🪟 Locating lightbox iframe...")
        frame = page.frame_locator("iframe").first
        email_input = frame.locator('input[name="email"]')
        email_input.wait_for(timeout=15000)  # Wait for email field
        print("✅ Email input visible inside iframe.")

        # ---------------- Stage 3: Fill credentials ----------------
        print("✍ Filling email...")
        email_input.fill(email)
        frame.locator('button:has-text("Continue")').click()

        print("✍ Filling password...")
        password_input = frame.locator('input[name="password"]')
        password_input.wait_for(timeout=15000)
        password_input.fill(password)
        frame.locator('button:has-text("Log In")').click()

        print("✅ Login submitted, waiting for auth to complete...")
        page.wait_for_timeout(5000)
    except Exception:
        # Do not leave a browser window and driver process behind when
        # the flow fails part-way.
        if browser is not None:
            browser.close()
        p.stop()
        raise

    return browser, page
50
+
51
+ # ---------------- Test ----------------
52
+ if __name__ == "__main__":
53
+ EMAIL = os.getenv("WIX_EMAIL")
54
+ browser, page = login_to_wix(EMAIL)
kb_builder/auth/stage3.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from playwright.sync_api import sync_playwright, TimeoutError
3
+
4
+ LOGIN_URL = "https://www.onlythetruth.in/learnastrology"
5
+
6
def login_to_wix(email):
    """Walk through the site's login lightboxes and submit the credentials.

    The password is read from the ``WIX_PASSWORD`` environment variable.

    Args:
        email: Account e-mail typed into the login form.

    Returns:
        ``(browser, context, page)``; the browser is left open for the
        caller, who is responsible for closing it.

    Raises:
        RuntimeError: if ``WIX_PASSWORD`` is not set.
        ValueError: if ``email`` is empty.
    """
    password = os.getenv("WIX_PASSWORD")
    if not password:
        raise RuntimeError("❌ WIX_PASSWORD not set")
    if not email:
        raise ValueError("email must be a non-empty string")

    # Start Playwright explicitly rather than via `with sync_playwright()`:
    # leaving a `with` block stops Playwright, so the browser/context/page
    # handed back to the caller would already be closed.
    p = sync_playwright().start()
    browser = None
    try:
        browser = p.chromium.launch(
            headless=False,
            slow_mo=50,
        )

        context = browser.new_context()
        page = context.new_page()

        # -------------------------------------------------
        # 1. Open site
        # -------------------------------------------------
        print("🌐 Opening site...")
        page.goto(LOGIN_URL, wait_until="domcontentloaded")
        page.wait_for_timeout(4000)

        # -------------------------------------------------
        # 2. Click top-right Log In (TEXT, not button)
        # -------------------------------------------------
        print("🔑 Clicking top-right Log In...")
        login_text = page.locator('span:has-text("Log In")').first
        login_text.wait_for(timeout=15000)
        page.evaluate("(el) => el.click()", login_text.element_handle())
        page.wait_for_timeout(3000)

        # -------------------------------------------------
        # 3. Switch to iframe (lightbox)
        # -------------------------------------------------
        print("🪟 Waiting for login iframe...")
        frame = page.frame_locator("iframe").first

        # -------------------------------------------------
        # 4. FIRST LIGHTBOX → "Already a member? Log In"
        # -------------------------------------------------
        print("➡ Switching to Login mode...")
        try:
            frame.get_by_role(
                "button",
                name="Already a member? Log In",
            ).click(timeout=8000)
            page.wait_for_timeout(2000)
        except TimeoutError:
            # The button never showed up within the timeout.
            print("ℹ Already in login mode")

        # -------------------------------------------------
        # 5. SECOND LIGHTBOX → "Log in with Email"
        # -------------------------------------------------
        print("📧 Selecting Email login...")
        email_login_button = frame.get_by_role(
            "button",
            name="Log in with Email",
        )
        email_login_button.wait_for(timeout=15000)
        email_login_button.click()
        page.wait_for_timeout(2000)

        # -------------------------------------------------
        # 6. FINAL FORM → Email + Password
        # -------------------------------------------------
        print("✍ Filling credentials...")

        email_input = frame.locator('input[type="email"]')
        email_input.wait_for(timeout=15000)
        email_input.fill(email)

        password_input = frame.locator('input[type="password"]')
        password_input.wait_for(timeout=15000)
        password_input.fill(password)

        frame.get_by_role(
            "button",
            name="Log In",
        ).click()

        print("✅ Login submitted")
        page.wait_for_timeout(5000)
    except Exception:
        # Do not leave a browser window and driver process behind when
        # the flow fails part-way.
        if browser is not None:
            browser.close()
        p.stop()
        raise

    return browser, context, page
kb_builder/auth/test_login.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from wix_login import login_to_wix

# Manual smoke test: open the browser, run the login click, then wait for
# the operator before tearing everything down.
EMAIL = os.getenv("WIX_EMAIL")

browser, context, page, playwright = login_to_wix(EMAIL)

try:
    input("🔒 Browser is open. Press ENTER to close...")
finally:
    # Runs even when input() is interrupted (Ctrl-C / closed stdin), so
    # the browser and the Playwright driver are always shut down.
    browser.close()
    playwright.stop()
kb_builder/auth/test_login1.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# wix_login1.login_to_wix is the variant that accepts `headless` and
# returns (browser, context, page); wix_login.login_to_wix takes only the
# e-mail and returns four values, so importing it here fails at the call.
from wix_login1 import login_to_wix

# Read from the environment (as test_login.py does) instead of keeping an
# account address in the source.
EMAIL = os.getenv("WIX_EMAIL")
if not EMAIL:
    raise RuntimeError("WIX_EMAIL environment variable not set")

browser, context, page = login_to_wix(
    EMAIL,
    headless=False  # Keep False for testing to see the flow
)

print("🟢 Logged in successfully")
input("Press ENTER to close browser...")

browser.close()
kb_builder/auth/wix_login.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from playwright.sync_api import sync_playwright, TimeoutError
3
+
4
+ LOGIN_URL = "https://www.onlythetruth.in/learnastrology"
5
+
6
def login_to_wix(email):
    """Open ``LOGIN_URL`` in Chromium and click the top-right "Log In" text.

    Only the first click of the login flow is performed here; ``email`` is
    accepted but not used yet.  ``WIX_PASSWORD`` must still be set in the
    environment.

    Args:
        email: Account e-mail (currently unused by this function).

    Returns:
        ``(browser, context, page, playwright)``.  The caller must call
        ``browser.close()`` and ``playwright.stop()`` when finished.

    Raises:
        RuntimeError: if ``WIX_PASSWORD`` is not set.
    """
    password = os.getenv("WIX_PASSWORD")
    if not password:
        raise RuntimeError("❌ WIX_PASSWORD not set")

    # ❗ DO NOT use `with`: the Playwright handle is returned to the caller
    # and must outlive this function.
    p = sync_playwright().start()
    browser = None
    try:
        browser = p.chromium.launch(
            headless=False,
            slow_mo=100,
        )

        context = browser.new_context()
        page = context.new_page()

        print("🌐 Opening site...")
        page.goto(LOGIN_URL, wait_until="domcontentloaded")
        page.wait_for_timeout(5000)

        print("🔑 Clicking top-right Log In...")
        login_text = page.locator('span:has-text("Log In")').first
        login_text.wait_for(timeout=15000)

        # JS click is mandatory for Wix
        page.evaluate("(el) => el.click()", login_text.element_handle())

        print("✅ Login click attempted")
        page.wait_for_timeout(10000)  # KEEP BROWSER OPEN
    except Exception:
        # The caller never receives the handles on failure, so release
        # them here before re-raising.
        if browser is not None:
            browser.close()
        p.stop()
        raise

    return browser, context, page, p
kb_builder/auth/wix_login1.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from playwright.sync_api import sync_playwright, TimeoutError
4
+
5
+ WIX_URL = "https://www.onlythetruth.in/learnastrology"
6
+
7
def login_to_wix(email: str, headless: bool = False):
    """Log in through the site's lightbox flow using page-level locators.

    The password is read from the ``WIX_PASSWORD`` environment variable.

    Args:
        email: Account e-mail typed into the login form.
        headless: Passed to ``chromium.launch``; ``False`` shows the window.

    Returns:
        ``(browser, context, page)``; the browser is left open for the
        caller, who is responsible for closing it.

    Raises:
        RuntimeError: if ``WIX_PASSWORD`` is not set.
        ValueError: if ``email`` is empty.
    """
    password = os.getenv("WIX_PASSWORD")
    if not password:
        raise RuntimeError("❌ WIX_PASSWORD env variable not set")
    if not email:
        raise ValueError("email must be a non-empty string")

    # Start Playwright explicitly rather than via `with sync_playwright()`:
    # leaving a `with` block stops Playwright, so the browser/context/page
    # handed back to the caller would already be closed.
    p = sync_playwright().start()
    browser = None
    try:
        browser = p.chromium.launch(headless=headless, slow_mo=100)
        context = browser.new_context()
        page = context.new_page()

        print("🌐 Opening site...")
        page.goto(WIX_URL, wait_until="domcontentloaded")

        # Step 1: Click the top Login button
        print("🔑 Clicking Login button...")
        page.get_by_text("Log In", exact=False).first.click()
        # Pauses go through Playwright's own timer instead of time.sleep().
        page.wait_for_timeout(2000)  # wait for first lightbox

        # Step 2: Click the "Already a member? Log In" link if visible
        try:
            page.get_by_text("Already a member?", exact=False).click()
            print("ℹ Switched to login mode")
            page.wait_for_timeout(1000)
        except TimeoutError:
            print("ℹ Already in login mode – skipping switch")

        # Step 3: Click "Log in with Email" button
        try:
            page.get_by_text("Log in with Email", exact=False).click()
            print("ℹ Email login form displayed")
            page.wait_for_timeout(1000)
        except TimeoutError:
            print("ℹ Email form already visible")

        # Step 4: Fill email and password
        print("✍ Filling credentials...")
        page.locator('input[type="email"]').fill(email)
        page.locator('input[type="password"]').fill(password)

        # Step 5: Click final Log In button
        print("🚀 Submitting login...")
        page.get_by_role("button", name="Log In").click()

        # Wait for login to complete
        page.wait_for_timeout(3000)
        print("✅ Login completed")
    except Exception:
        # Do not leave a browser window and driver process behind when
        # the flow fails part-way.
        if browser is not None:
            browser.close()
        p.stop()
        raise

    return browser, context, page
55
+
56
+ if __name__ == "__main__":
57
+ EMAIL = os.getenv("WIX_EMAIL")
58
+ browser, context, page = login_to_wix(EMAIL)
kb_builder/chunker/semantic_chunker.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def chunk_text(text, max_chars=1200):
    """Pack the sentences of *text* into chunks of at most *max_chars* chars.

    Sentences are obtained by splitting on ``". "``.  The period removed by
    the split is restored on every piece except the final one, which keeps
    whatever ending the input had (so no doubled period is produced).

    Args:
        text: Text to split.
        max_chars: Upper bound on chunk length.  A single sentence longer
            than this is emitted as its own, oversized chunk.

    Returns:
        A list of non-empty chunk strings; empty or whitespace-only input
        yields an empty list.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1

    sentences = []
    for position, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue
        if position < last_index:
            sentence += "."
        sentences.append(sentence)

    chunks = []
    current = ""
    for sentence in sentences:
        if not current:
            # Nothing buffered yet: never flush an empty chunk.
            current = sentence
        elif len(current) + 1 + len(sentence) <= max_chars:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence

    if current:
        chunks.append(current)
    return chunks
kb_builder/config.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wix:
2
+ login_url: "https://www.wix.com/login"
3
+ email: "satishdevloper03@gmail.com"
4
+
5
+
6
+
7
+ courses:
8
+ - course_id: "vedic_astrology_lvl1"
9
+ course_name: "Vedic Astrology Level 1"
10
+ paid: false
11
+ lesson_urls:
12
+ - "https://yoursite.com/lesson-1"
13
+ - "https://yoursite.com/lesson-2"
14
+
15
+
16
+ - course_id: "vedic_astrology_lvl2"
17
+ course_name: "Vedic Astrology Level 2"
18
+ paid: true
19
+ lesson_urls:
20
+ - "https://yoursite.com/lesson-10"
21
+
22
+
23
+ output:
24
+ vector_store_dir: "vector_store"
kb_builder/embeddings/embedder.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+
3
+
4
+ _model = SentenceTransformer("all-MiniLM-L6-v2")
5
+
6
+
7
def embed_texts(texts):
    """Encode *texts* with the module-level ``_model``.

    A progress bar is shown while encoding; the value returned by
    ``_model.encode`` is passed back unchanged.
    """
    vectors = _model.encode(texts, show_progress_bar=True)
    return vectors
kb_builder/index/faiss_builder.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import json
3
+ import numpy as np
4
+
5
+
6
def build_faiss_index(vectors, metadata, output_dir):
    """Build a flat L2 FAISS index and write it next to its metadata.

    Writes ``index.faiss`` and ``metadata.json`` into *output_dir*, which
    must already exist.

    Args:
        vectors: 2-D array-like with one embedding per row.
        metadata: Sequence with one JSON-serialisable entry per row of
            *vectors*, in the same order.
        output_dir: Directory receiving both output files.

    Raises:
        ValueError: if *vectors* is not 2-D, or its row count differs from
            ``len(metadata)``; index rows and metadata entries would
            otherwise no longer line up.
    """
    # Convert once to the contiguous float32 layout added to the index.
    matrix = np.ascontiguousarray(vectors, dtype="float32")
    if matrix.ndim != 2:
        raise ValueError(
            f"vectors must be 2-D (n_vectors, dim), got shape {matrix.shape}"
        )
    if len(metadata) != matrix.shape[0]:
        raise ValueError(
            f"got {matrix.shape[0]} vectors but {len(metadata)} metadata entries"
        )

    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)

    faiss.write_index(index, f"{output_dir}/index.faiss")

    with open(f"{output_dir}/metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
kb_builder/ocr/image_ocr.py ADDED
File without changes
kb_builder/parser/html_parser.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+
3
+
4
def parse_lesson(html):
    """Extract the lesson title and its text sections from an HTML page.

    Args:
        html: Markup of the lesson page.

    Returns:
        ``(lesson_title, sections)``.  ``lesson_title`` is the text of the
        first ``<h1>``, falling back to ``<title>`` and then to
        ``"Untitled lesson"`` when neither has text.  ``sections`` is a
        list of ``{"heading", "text"}`` dicts, one per non-empty
        ``<section>``/``<article>``; ``heading`` is the first h2/h3/h4
        inside it, or ``"General"`` when there is none.
    """
    soup = BeautifulSoup(html, "lxml")

    # A page without <h1> must not crash the whole pipeline run.
    lesson_title = ""
    for tag_name in ("h1", "title"):
        title_tag = soup.find(tag_name)
        if title_tag is not None:
            lesson_title = title_tag.get_text(strip=True)
        if lesson_title:
            break
    if not lesson_title:
        lesson_title = "Untitled lesson"

    sections = []
    for sec in soup.find_all(["section", "article"]):
        text = sec.get_text("\n", strip=True)
        if not text:
            continue

        header = sec.find(["h2", "h3", "h4"])
        sections.append({
            "heading": header.get_text(strip=True) if header is not None else "General",
            "text": text,
        })

    return lesson_title, sections
kb_builder/run_pipeline.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import yaml

from auth.wix_login import login_to_wix
from chunker.semantic_chunker import chunk_text
from embeddings.embedder import embed_texts
from index.faiss_builder import build_faiss_index
from parser.html_parser import parse_lesson
from scraper.lesson_scraper import fetch_lesson_html


# Close the config file handle instead of leaving it to the garbage
# collector.
with open("config.yaml", encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)


# auth.wix_login.login_to_wix takes only the e-mail (it uses its own
# LOGIN_URL constant) and returns (browser, context, page, playwright).
browser, context, page, playwright = login_to_wix(cfg["wix"]["email"])


all_chunks = []
metadata = []


try:
    for course in cfg["courses"]:
        for url in course["lesson_urls"]:
            html = fetch_lesson_html(page, url)
            title, sections = parse_lesson(html)

            for sec in sections:
                for c in chunk_text(sec["text"]):
                    all_chunks.append(c)
                    # One metadata entry per chunk, in the same order, so
                    # entry i describes vector i.
                    metadata.append({
                        "course": course["course_name"],
                        "lesson": title,
                        "section": sec["heading"],
                        "paid": course["paid"],
                        "url": url
                    })
finally:
    # Scraping is the only stage that needs the browser; release it even
    # when a lesson fails to load or parse.
    browser.close()
    playwright.stop()


if not all_chunks:
    # Nothing to embed means there is no vector dimension to build from.
    raise RuntimeError("No lesson content was extracted; nothing to index")


vectors = embed_texts(all_chunks)
os.makedirs(cfg["output"]["vector_store_dir"], exist_ok=True)
build_faiss_index(vectors, metadata, cfg["output"]["vector_store_dir"])


print("Knowledge base build completed")
kb_builder/scraper/lesson_scraper.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
def fetch_lesson_html(page, url):
    """Load *url* in *page* and return the page's HTML content.

    After navigating, waits for the ``"networkidle"`` load state before
    reading ``page.content()``.
    """
    page.goto(url)
    page.wait_for_load_state("networkidle")
    html = page.content()
    return html
rag.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import faiss
4
+ import numpy as np
5
+ from dotenv import load_dotenv
6
+ from sentence_transformers import SentenceTransformer
7
+ from groq import Groq
8
+
9
+ # ---------------- CONFIG ----------------
10
+ INDEX_DIR = "index"
11
+ TOP_K = 4
12
+
13
+ # ---------------- LOAD ENV ----------------
14
+ load_dotenv()
15
+
16
+ api_key = os.getenv("GROQ_API_KEY")
17
+ if not api_key:
18
+ raise ValueError("❌ GROQ_API_KEY not found in .env")
19
+
20
+ client = Groq(api_key=api_key)
21
+
22
+ # ---------------- LOAD INDEX ----------------
23
+ index = faiss.read_index(f"{INDEX_DIR}/faiss.index")
24
+
25
+ with open(f"{INDEX_DIR}/documents.pkl", "rb") as f:
26
+ documents = pickle.load(f)
27
+
28
+ # ---------------- EMBEDDINGS ----------------
29
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
30
+
31
+
32
+ # ---------------- RETRIEVAL ----------------
33
def retrieve(query, top_k=TOP_K):
    """Return the stored documents whose embeddings are nearest to *query*.

    The query is encoded with the module-level ``embedder``, cast to
    float32 and searched against the module-level ``index``.  Result
    positions equal to -1 are skipped, so fewer than *top_k* documents
    may be returned.
    """
    query_vector = np.array(embedder.encode([query])).astype("float32")
    _, neighbour_ids = index.search(query_vector, top_k)

    # Only the first result row is relevant: a single query was encoded.
    return [documents[doc_id] for doc_id in neighbour_ids[0] if doc_id != -1]
46
+
47
+
48
+ # ---------------- LLM ----------------
49
def ask_llm(query, contexts):
    """Ask the chat model to answer *query* using the retrieved *contexts*.

    Args:
        query: The user's question.
        contexts: Retrieved chunks; each is a dict with ``source``,
            ``page`` and ``text`` keys.

    Returns:
        The text of the first completion choice, or ``""`` when the
        response carries no text content.
    """
    if contexts:
        context_text = "\n\n".join(
            f"[{c['source']} p.{c['page']}]\n{c['text']}"
            for c in contexts
        )
    else:
        # Say so explicitly instead of sending an empty reference section.
        context_text = "(No reference material was retrieved for this question.)"

    # NOTE(review): the prompt text is rebuilt line by line because the
    # original leading whitespace inside the literals is not recoverable
    # from the diff view — confirm against the committed file.
    system_prompt = (
        "\n"
        "You are an astrology tutor.\n"
        "Explain concepts clearly and practically.\n"
        "If a chart or diagram from the reference material is useful,\n"
        "explicitly say: \"Refer to the diagram below.\"\n"
        "Otherwise, do not mention diagrams.\n"
    )
    user_prompt = (
        "\n"
        "Use the following reference material to answer the question.\n"
        "\n"
        f"{context_text}\n"
        "\n"
        f"Question: {query}\n"
    )

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
    )

    # Fall back to "" so callers always receive a string.
    return response.choices[0].message.content or ""