Spaces:

OnlyTheTruth03
/

OTT_Bot

Sleeping

App Files Files Community

OnlyTheTruth03 committed on Dec 16, 2025

Commit

709c859

1 Parent(s): cb057ef

Initial RAG bot

Browse files

Files changed (22) hide show

.gitignore +18 -0
app.py +83 -0
image_gen.py +37 -0
index/documents.pkl +3 -0
index/faiss.index +0 -0
ingest.py +81 -0
ingest_images.py +26 -0
kb_builder/auth/stage1.py +54 -0
kb_builder/auth/stage3.py +90 -0
kb_builder/auth/test_login.py +11 -0
kb_builder/auth/test_login1.py +13 -0
kb_builder/auth/wix_login.py +36 -0
kb_builder/auth/wix_login1.py +58 -0
kb_builder/chunker/semantic_chunker.py +16 -0
kb_builder/config.yaml +24 -0
kb_builder/embeddings/embedder.py +8 -0
kb_builder/index/faiss_builder.py +16 -0
kb_builder/ocr/image_ocr.py +0 -0
kb_builder/parser/html_parser.py +24 -0
kb_builder/run_pipeline.py +49 -0
kb_builder/scraper/lesson_scraper.py +4 -0
rag.py +82 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,18 @@

+# Secrets
+.env
+# Generated index artifacts
+index/*.pkl
+index/*.index
+# Virtual environment
+venv/
+__pycache__/
+# OS files
+.DS_Store
+Thumbs.db
+# Large raw PDFs (optional)
+data/pdfs/
+# Python build
+*.pyc

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os
+import streamlit as st
+from rag import retrieve, ask_llm
+# -----------------------------
+# Streamlit Page Config
+# -----------------------------
+st.set_page_config(
+    page_title="Only The Truth – Astrology Tutor",
+    page_icon="🪐",
+    layout="wide"
+)
+st.title("🪐 Only The Truth – Astrology Tutor")
+st.caption("Vedic Astrology • PDF-Based Knowledge • RAG Powered")
+# -----------------------------
+# User Input
+# -----------------------------
+query = st.text_input(
+    "Ask your astrology question:",
+    placeholder="Example: Explain how twins are analyzed using D60 chart"
+)
+# -----------------------------
+# Process Query
+# -----------------------------
+if query:
+    with st.spinner("🔍 Searching ancient wisdom..."):
+        contexts = retrieve(query)
+    with st.spinner("🧠 Interpreting charts..."):
+        answer = ask_llm(query, contexts)
+    # -----------------------------
+    # Answer Section
+    # -----------------------------
+    st.subheader("🪐 Answer")
+    st.markdown(answer)
+    # -----------------------------
+    # Image Display Logic (STRICT)
+    # -----------------------------
+    st.subheader("📘 Reference Diagrams")
+    IMAGE_DIR = "data/images"
+    image_found = False
+    query_lower = query.lower()
+    KEYWORDS = [
+        "chart", "diagram", "lagna", "horoscope",
+        "d60", "sashtyamsa", "divisional", "birth chart"
+    ]
+    for c in contexts:
+        text_lower = c["text"].lower()
+        # Only show images if BOTH query & chunk indicate diagram relevance
+        if not any(k in query_lower for k in KEYWORDS):
+            continue
+        if not any(k in text_lower for k in KEYWORDS):
+            continue
+        for img in c.get("images", []):
+            img_path = os.path.join(IMAGE_DIR, img)
+            if os.path.exists(img_path):
+                st.image(
+                    img_path,
+                    caption=f"{c['source']} — page {c['page']}",
+                    use_container_width=True
+                )
+                image_found = True
+    if not image_found:
+        st.info("ℹ No relevant diagrams found in the reference material.")
+# -----------------------------
+# Footer
+# -----------------------------
+st.markdown("---")
+st.caption("Built with FAISS • SentenceTransformers • Groq • Streamlit")

image_gen.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import os
+import requests
+# Access token read from the environment; importing this module fails when it is unset.
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    raise RuntimeError("❌ HF_TOKEN not set")
+# Endpoint URL the image requests are posted to.
+API_URL = "https://router.huggingface.co/hf-inference/models/stabilityai/stable-diffusion-xl-base-1.0"
+# Headers sent with every request: bearer token plus a JSON content type.
+HEADERS = {
+    "Authorization": f"Bearer {HF_TOKEN}",
+    "Content-Type": "application/json"
+}
+def generate_image(prompt: str) -> bytes:
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "width": 1024,
+            "height": 1024,
+            "num_inference_steps": 30,
+            "guidance_scale": 7.5
+        }
+    }
+    response = requests.post(
+        API_URL,
+        headers=HEADERS,
+        json=payload,
+        timeout=120
+    )
+    if response.status_code != 200:
+        raise RuntimeError(response.text)
+    return response.content

index/documents.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b766710fe0a84473610afdf4676495aa27caff95e65f8dba4b60f70a70854d42
+size 55508

index/faiss.index ADDED Viewed

Binary file (49.2 kB). View file

ingest.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import os
+import pickle
+import pdfplumber
+from sentence_transformers import SentenceTransformer
+import faiss
+# Directory scanned for input PDFs.
+PDF_DIR = "data"
+# Directory extracted page images are written to.
+IMAGE_DIR = "data/images"
+# Directory the FAISS index and the pickled chunk list are written to.
+INDEX_DIR = "index"
+# Create both output directories up front so later writes cannot fail on a missing folder.
+os.makedirs(IMAGE_DIR, exist_ok=True)
+os.makedirs(INDEX_DIR, exist_ok=True)
+def chunk_text(text, size=500, overlap=50):
+    words = text.split()
+    chunks = []
+    for i in range(0, len(words), size - overlap):
+        chunks.append(" ".join(words[i:i + size]))
+    return chunks
+documents = []
+print("📥 Processing PDFs...")
+for pdf_file in os.listdir(PDF_DIR):
+    if not pdf_file.endswith(".pdf"):
+        continue
+    pdf_path = os.path.join(PDF_DIR, pdf_file)
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text() or ""
+            if not text.strip():
+                continue
+            # 🔹 Extract images
+            image_files = []
+            for i, img in enumerate(page.images):
+                try:
+                    x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
+                    cropped = page.crop((x0, top, x1, bottom))
+                    img_obj = cropped.to_image(resolution=200)
+                    img_name = f"{pdf_file}_p{page_num}_i{i}.png"
+                    img_path = os.path.join(IMAGE_DIR, img_name)
+                    img_obj.save(img_path)
+                    image_files.append(img_name)
+                except Exception:
+                    pass  # skip problematic images
+            # 🔹 Chunk text and attach images
+            for chunk in chunk_text(text):
+                documents.append({
+                    "text": chunk,
+                    "source": pdf_file,
+                    "page": page_num,
+                    "images": image_files
+                })
+print(f"📄 Total chunks: {len(documents)}")
+# 🔹 Embeddings
+print("🧠 Generating embeddings...")
+model = SentenceTransformer("all-MiniLM-L6-v2")
+texts = [doc["text"] for doc in documents]
+embeddings = model.encode(texts, show_progress_bar=True)
+# 🔹 FAISS
+print("📦 Building FAISS index...")
+index = faiss.IndexFlatL2(embeddings.shape[1])
+index.add(embeddings)
+faiss.write_index(index, f"{INDEX_DIR}/faiss.index")
+with open(f"{INDEX_DIR}/documents.pkl", "wb") as f:
+    pickle.dump(documents, f)
+print("✅ Ingestion complete (TEXT + IMAGES)")

ingest_images.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+import pdfplumber
+from PIL import Image
+PDF_DIR = "data"
+IMAGE_DIR = "data/images"
+os.makedirs(IMAGE_DIR, exist_ok=True)
+def extract_images():
+    for pdf_file in os.listdir(PDF_DIR):
+        if not pdf_file.endswith(".pdf"):
+            continue
+        pdf_path = os.path.join(PDF_DIR, pdf_file)
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_no, page in enumerate(pdf.pages):
+                for img_no, img in enumerate(page.images):
+                    bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
+                    cropped = page.crop(bbox).to_image(resolution=300)
+                    img_name = f"{pdf_file}_p{page_no}_i{img_no}.png"
+                    cropped.save(os.path.join(IMAGE_DIR, img_name))
+extract_images()

kb_builder/auth/stage1.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import os
+from playwright.sync_api import sync_playwright
+def login_to_wix(email):
+    """Drive a visible Chromium window through the site's login lightbox.
+    The password is read from the WIX_PASSWORD environment variable; *email*
+    is typed into the form. Returns ``(browser, page)``.
+    Raises RuntimeError when WIX_PASSWORD is unset.
+    """
+    password = os.getenv("WIX_PASSWORD")
+    if not password:
+        raise RuntimeError("WIX_PASSWORD environment variable not set")
+    # NOTE(review): the Playwright handle `p` is started here but neither
+    # stopped nor returned, so callers have no way to stop it — confirm intent.
+    p = sync_playwright().start()
+    browser = p.chromium.launch(
+        headless=False,
+        slow_mo=50
+    )
+    context = browser.new_context()
+    page = context.new_page()
+    print("🌐 Opening site...")
+    page.goto("https://www.onlythetruth.in/learnastrology", wait_until="domcontentloaded")
+    page.wait_for_timeout(4000)  # Allow hydration
+    # ---------------- Stage 1: Click top-right Login ----------------
+    print("🔑 Clicking top-right Login...")
+    login_elem = page.locator('text="Log In"').first
+    login_elem.wait_for(state="visible", timeout=10000)
+    # The click is dispatched from page JavaScript rather than through the locator.
+    page.evaluate('(el) => el.click()', login_elem.element_handle())
+    page.wait_for_timeout(3000)  # Wait for lightbox to appear
+    print("✅ Lightbox should appear now.")
+    # ---------------- Stage 2: Get iframe ----------------
+    print("🪟 Locating lightbox iframe...")
+    # Takes the first iframe on the page; assumes that is the login lightbox — TODO confirm.
+    frame = page.frame_locator("iframe").first
+    frame.locator('input[name="email"]').wait_for(timeout=15000)  # Wait for email field
+    print("✅ Email input visible inside iframe.")
+    # ---------------- Stage 3: Fill credentials ----------------
+    print("✍ Filling email...")
+    frame.locator('input[name="email"]').fill(email)
+    frame.locator('button:has-text(\"Continue\")').click()
+    print("✍ Filling password...")
+    frame.locator('input[name="password"]').wait_for(timeout=15000)
+    frame.locator('input[name="password"]').fill(password)
+    frame.locator('button:has-text(\"Log In\")').click()
+    print("✅ Login submitted, waiting for auth to complete...")
+    # Fixed five-second pause; nothing here checks that the login actually succeeded.
+    page.wait_for_timeout(5000)
+    return browser, page
+# ---------------- Test ----------------
+# Manual run: the email comes from WIX_EMAIL (None when unset) and the
+# returned handles are not closed here.
+if __name__ == "__main__":
+    EMAIL = os.getenv("WIX_EMAIL")
+    browser, page = login_to_wix(EMAIL)

kb_builder/auth/stage3.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import os
+from playwright.sync_api import sync_playwright, TimeoutError
+LOGIN_URL = "https://www.onlythetruth.in/learnastrology"
+def login_to_wix(email):
+    password = os.getenv("WIX_PASSWORD")
+    if not password:
+        raise RuntimeError("❌ WIX_PASSWORD not set")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(
+            headless=False,
+            slow_mo=50
+        )
+        context = browser.new_context()
+        page = context.new_page()
+        # -------------------------------------------------
+        # 1. Open site
+        # -------------------------------------------------
+        print("🌐 Opening site...")
+        page.goto(LOGIN_URL, wait_until="domcontentloaded")
+        page.wait_for_timeout(4000)
+        # -------------------------------------------------
+        # 2. Click top-right Log In (TEXT, not button)
+        # -------------------------------------------------
+        print("🔑 Clicking top-right Log In...")
+        login_text = page.locator('span:has-text("Log In")').first
+        login_text.wait_for(timeout=15000)
+        page.evaluate("(el) => el.click()", login_text.element_handle())
+        page.wait_for_timeout(3000)
+        # -------------------------------------------------
+        # 3. Switch to iframe (lightbox)
+        # -------------------------------------------------
+        print("🪟 Waiting for login iframe...")
+        frame = page.frame_locator("iframe").first
+        # -------------------------------------------------
+        # 4. FIRST LIGHTBOX → "Already a member? Log In"
+        # -------------------------------------------------
+        print("➡ Switching to Login mode...")
+        try:
+            frame.get_by_role(
+                "button",
+                name="Already a member? Log In"
+            ).click(timeout=8000)
+            page.wait_for_timeout(2000)
+        except TimeoutError:
+            print("ℹ Already in login mode")
+        # -------------------------------------------------
+        # 5. SECOND LIGHTBOX → "Log in with Email"
+        # -------------------------------------------------
+        print("📧 Selecting Email login...")
+        frame.get_by_role(
+            "button",
+            name="Log in with Email"
+        ).wait_for(timeout=15000)
+        frame.get_by_role(
+            "button",
+            name="Log in with Email"
+        ).click()
+        page.wait_for_timeout(2000)
+        # -------------------------------------------------
+        # 6. FINAL FORM → Email + Password
+        # -------------------------------------------------
+        print("✍ Filling credentials...")
+        email_input = frame.locator('input[type="email"]')
+        email_input.wait_for(timeout=15000)
+        email_input.fill(email)
+        password_input = frame.locator('input[type="password"]')
+        password_input.wait_for(timeout=15000)
+        password_input.fill(password)
+        frame.get_by_role(
+            "button",
+            name="Log In"
+        ).click()
+        print("✅ Login submitted")
+        page.wait_for_timeout(5000)
+        return browser, context, page

kb_builder/auth/test_login.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import os
+from wix_login import login_to_wix
+EMAIL = os.getenv("WIX_EMAIL")
+browser, context, page, playwright = login_to_wix(EMAIL)
+input("🔒 Browser is open. Press ENTER to close...")
+browser.close()
+playwright.stop()

kb_builder/auth/test_login1.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from wix_login import login_to_wix
+EMAIL = "satishdevloper03@gmail.com"
+browser, context, page = login_to_wix(
+    EMAIL,
+    headless=False  # Keep False for testing to see the flow
+)
+print("🟢 Logged in successfully")
+input("Press ENTER to close browser...")
+browser.close()

kb_builder/auth/wix_login.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import os
+from playwright.sync_api import sync_playwright, TimeoutError
+# Page that hosts the top-right "Log In" entry point.
+LOGIN_URL = "https://www.onlythetruth.in/learnastrology"
+def login_to_wix(email):
+    """Open the site in a visible Chromium window and click the "Log In" text.
+    Returns ``(browser, context, page, p)`` where ``p`` is the started
+    Playwright handle. Raises RuntimeError when WIX_PASSWORD is unset.
+    NOTE(review): in this version neither *email* nor the password is typed
+    into any form — the function stops after the click. Confirm whether the
+    credential steps are still to be added.
+    """
+    password = os.getenv("WIX_PASSWORD")
+    if not password:
+        raise RuntimeError("❌ WIX_PASSWORD not set")
+    # ❗ DO NOT use `with`
+    p = sync_playwright().start()
+    browser = p.chromium.launch(
+        headless=False,
+        slow_mo=100
+    )
+    context = browser.new_context()
+    page = context.new_page()
+    print("🌐 Opening site...")
+    page.goto(LOGIN_URL, wait_until="domcontentloaded")
+    page.wait_for_timeout(5000)
+    print("🔑 Clicking top-right Log In...")
+    login_text = page.locator('span:has-text("Log In")').first
+    login_text.wait_for(timeout=15000)
+    # JS click is mandatory for Wix
+    page.evaluate("(el) => el.click()", login_text.element_handle())
+    print("✅ Login click attempted")
+    page.wait_for_timeout(10000)  # KEEP BROWSER OPEN
+    # `p` is returned so the caller can stop Playwright after closing the browser.
+    return browser, context, page, p

kb_builder/auth/wix_login1.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import os
+import time
+from playwright.sync_api import sync_playwright, TimeoutError
+WIX_URL = "https://www.onlythetruth.in/learnastrology"
+def login_to_wix(email: str, headless: bool = False):
+    password = os.getenv("WIX_PASSWORD")
+    if not password:
+        raise RuntimeError("❌ WIX_PASSWORD env variable not set")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=headless, slow_mo=100)
+        context = browser.new_context()
+        page = context.new_page()
+        print("🌐 Opening site...")
+        page.goto(WIX_URL, wait_until="domcontentloaded")
+        # Step 1: Click the top Login button
+        print("🔑 Clicking Login button...")
+        page.get_by_text("Log In", exact=False).first.click()
+        time.sleep(2)  # wait for first lightbox
+        # Step 2: Click the "Already a member? Log In" link if visible
+        try:
+            page.get_by_text("Already a member?", exact=False).click()
+            print("ℹ Switched to login mode")
+            time.sleep(1)
+        except TimeoutError:
+            print("ℹ Already in login mode – skipping switch")
+        # Step 3: Click "Log in with Email" button
+        try:
+            page.get_by_text("Log in with Email", exact=False).click()
+            print("ℹ Email login form displayed")
+            time.sleep(1)
+        except TimeoutError:
+            print("ℹ Email form already visible")
+        # Step 4: Fill email and password
+        print("✍ Filling credentials...")
+        page.locator('input[type="email"]').fill(email)
+        page.locator('input[type="password"]').fill(password)
+        # Step 5: Click final Log In button
+        print("🚀 Submitting login...")
+        page.get_by_role("button", name="Log In").click()
+        # Wait for login to complete
+        time.sleep(3)
+        print("✅ Login completed")
+        return browser, context, page
+# Manual run: the email comes from WIX_EMAIL (None when unset) and the
+# returned handles are not closed here.
+if __name__ == "__main__":
+    EMAIL = os.getenv("WIX_EMAIL")
+    browser, context, page = login_to_wix(EMAIL)

kb_builder/chunker/semantic_chunker.py ADDED Viewed

	@@ -0,0 +1,16 @@

+def chunk_text(text, max_chars=1200):
+sentences = text.split(". ")
+chunks, current = [], ""
+for s in sentences:
+if len(current) + len(s) < max_chars:
+current += s + ". "
+else:
+chunks.append(current.strip())
+current = s + ". "
+if current:
+chunks.append(current.strip())
+return chunks

kb_builder/config.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+wix:
+  login_url: "https://www.wix.com/login"
+  email: "satishdevloper03@gmail.com"
+courses:
+  - course_id: "vedic_astrology_lvl1"
+    course_name: "Vedic Astrology Level 1"
+    paid: false
+    lesson_urls:
+      - "https://yoursite.com/lesson-1"
+      - "https://yoursite.com/lesson-2"
+  - course_id: "vedic_astrology_lvl2"
+    course_name: "Vedic Astrology Level 2"
+    paid: true
+    lesson_urls:
+      - "https://yoursite.com/lesson-10"
+output:
+  vector_store_dir: "vector_store"

kb_builder/embeddings/embedder.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from sentence_transformers import SentenceTransformer
+_model = SentenceTransformer("all-MiniLM-L6-v2")
+def embed_texts(texts):
+return _model.encode(texts, show_progress_bar=True)

kb_builder/index/faiss_builder.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import faiss
+import json
+import numpy as np
+def build_faiss_index(vectors, metadata, output_dir):
+dim = vectors.shape[1]
+index = faiss.IndexFlatL2(dim)
+index.add(vectors.astype("float32"))
+faiss.write_index(index, f"{output_dir}/index.faiss")
+with open(f"{output_dir}/metadata.json", "w", encoding="utf-8") as f:
+json.dump(metadata, f, indent=2, ensure_ascii=False)

kb_builder/ocr/image_ocr.py ADDED Viewed

File without changes

kb_builder/parser/html_parser.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from bs4 import BeautifulSoup
+def parse_lesson(html):
+soup = BeautifulSoup(html, "lxml")
+lesson_title = soup.find("h1").get_text(strip=True)
+sections = []
+for sec in soup.find_all(["section", "article"]):
+header = sec.find(["h2", "h3", "h4"])
+text = sec.get_text("\n", strip=True)
+if text:
+sections.append({
+"heading": header.get_text(strip=True) if header else "General",
+"text": text
+})
+return lesson_title, sections

kb_builder/run_pipeline.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import yaml
+import os
+from auth.wix_login import login_to_wix
+from scraper.lesson_scraper import fetch_lesson_html
+from parser.html_parser import parse_lesson
+from chunker.semantic_chunker import chunk_text
+from embeddings.embedder import embed_texts
+from index.faiss_builder import build_faiss_index
+cfg = yaml.safe_load(open("config.yaml"))
+browser, page = login_to_wix(
+cfg["wix"]["login_url"],
+cfg["wix"]["email"],
+)
+all_chunks = []
+metadata = []
+for course in cfg["courses"]:
+for url in course["lesson_urls"]:
+html = fetch_lesson_html(page, url)
+title, sections = parse_lesson(html)
+for sec in sections:
+chunks = chunk_text(sec["text"])
+for c in chunks:
+all_chunks.append(c)
+metadata.append({
+"course": course["course_name"],
+"lesson": title,
+"section": sec["heading"],
+"paid": course["paid"],
+"url": url
+})
+vectors = embed_texts(all_chunks)
+os.makedirs(cfg["output"]["vector_store_dir"], exist_ok=True)
+build_faiss_index(vectors, metadata, cfg["output"]["vector_store_dir"])
+browser.close()
+print("Knowledge base build completed")

kb_builder/scraper/lesson_scraper.py ADDED Viewed

	@@ -0,0 +1,4 @@

+def fetch_lesson_html(page, url):
+page.goto(url)
+page.wait_for_load_state("networkidle")
+return page.content()

rag.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import os
+import pickle
+import faiss
+import numpy as np
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+from groq import Groq
+# ---------------- CONFIG ----------------
+# Directory holding faiss.index and documents.pkl.
+INDEX_DIR = "index"
+# Default number of nearest chunks requested per query.
+TOP_K = 4
+# ---------------- LOAD ENV ----------------
+load_dotenv()
+api_key = os.getenv("GROQ_API_KEY")
+# Importing this module fails when the key is missing.
+if not api_key:
+    raise ValueError("❌ GROQ_API_KEY not found in .env")
+client = Groq(api_key=api_key)
+# ---------------- LOAD INDEX ----------------
+index = faiss.read_index(f"{INDEX_DIR}/faiss.index")
+# NOTE(review): pickle.load runs code embedded in the file; this is only safe
+# while documents.pkl is produced by this project's own ingestion — confirm.
+with open(f"{INDEX_DIR}/documents.pkl", "rb") as f:
+    documents = pickle.load(f)
+# ---------------- EMBEDDINGS ----------------
+# Encodes queries in retrieve(); presumably must be the same model that
+# built the index — verify against the ingestion script.
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# ---------------- RETRIEVAL ----------------
+def retrieve(query, top_k=TOP_K):
+    query_embedding = embedder.encode([query])
+    query_embedding = np.array(query_embedding).astype("float32")
+    distances, indices = index.search(query_embedding, top_k)
+    results = []
+    for idx in indices[0]:
+        if idx == -1:
+            continue
+        results.append(documents[idx])
+    return results
+# ---------------- LLM ----------------
+def ask_llm(query, contexts):
+    context_text = "\n\n".join(
+        f"[{c['source']} p.{c['page']}]\n{c['text']}"
+        for c in contexts
+    )
+    response = client.chat.completions.create(
+        model="llama-3.1-8b-instant",
+        messages=[
+            {
+                "role": "system",
+                "content": """
+You are an astrology tutor.
+Explain concepts clearly and practically.
+If a chart or diagram from the reference material is useful,
+explicitly say: "Refer to the diagram below."
+Otherwise, do not mention diagrams.
+"""
+            },
+            {
+                "role": "user",
+                "content": f"""
+Use the following reference material to answer the question.
+{context_text}
+Question: {query}
+"""
+            }
+        ],
+        temperature=0.2
+    )
+    return response.choices[0].message.content