Spaces:

menikev
/

KnowYourRIght-Bot

Sleeping

+import os
+from dotenv import load_dotenv
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.llms import HuggingFacePipeline
+from transformers import pipeline
+from retriever import get_retriever
+load_dotenv()
+# Load retriever
+retriever = get_retriever()
+# Load Hugging Face LLM
+# Load the model pipeline
+pipe = pipeline(
+    "text-generation",
+    model="tiiuae/falcon-7b-instruct",
+    trust_remote_code=True,
+    device_map="auto",
+    max_new_tokens=512,
+    temperature=0.2
+)
+# Wrap in LangChain LLM
+llm = HuggingFacePipeline(pipeline=pipe)
+# Prompt templates
+english_prompt_template = """
+You are a helpful Nigerian legal assistant.
+Answer clearly in English, keeping the legal facts correct.
+After the answer, list the sources you used.
+Question: {question}
+Answer:
+"""
+pidgin_prompt_template = """
+You be legal assistant wey sabi Nigerian law well well.
+The user fit talk for English or Pidgin, but you go always answer for Nigerian Pidgin.
+No change the legal facts, but make am simple so person wey no study law fit understand.
+After you give the answer, put list of the sources wey you use.
+Question: {question}
+Answer for Nigerian Pidgin:
+"""
+# Create QA chain
+qa_chain = RetrievalQA.from_chain_type(
+    llm=llm,
+    retriever=retriever,
+    chain_type="stuff",
+    return_source_documents=True
+)
+def chat():
+    print("📜 KnowYourRight Bot")
+    print("Type 'exit' to stop.\n")
+    # Ask language mode
+    while True:
+        lang_choice = input("Choose mode: [1] English  [2] Pidgin: ").strip()
+        if lang_choice in ["1", "2"]:
+            break
+        print("❌ Invalid choice. Please type 1 or 2.")
+    pidgin_mode = lang_choice == "2"
+    # Start chat loop
+    while True:
+        query = input("\nYou: ")
+        if query.lower() in ["exit", "quit"]:
+            break
+        # Pick prompt based on mode
+        if pidgin_mode:
+            formatted_query = pidgin_prompt_template.format(question=query)
+        else:
+            formatted_query = english_prompt_template.format(question=query)
+        result = qa_chain.invoke({"query": formatted_query})
+        # Print answer
+        print("\nBot:", result["result"])
+        # Print sources
+        print("\n📚 Sources:")
+        for doc in result["source_documents"]:
+            print("-", doc.metadata.get("source", "Unknown"))
+        print("\n" + "-"*50)
+if __name__ == "__main__":
+    chat()

src/api.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/embeddings.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/evaluation.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/ingest_documents.py ADDED Viewed

	@@ -0,0 +1,130 @@

+"""
+PDF Ingestion Pipeline for KnowYourRight Bot
+- Loads PDFs from data/raw (relative to the working directory)
+- Checks if pages are scanned or text-based
+- Runs OCR when needed
+- Splits into chunks for embedding
+- Generates embeddings using open-source models
+- Saves into ChromaDB vector store
+"""
+import os
+import sys
+import fitz  # PyMuPDF
+import pytesseract
+from PIL import Image
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.docstore.document import Document
+from dotenv import load_dotenv
+from huggingface_hub import login
+# Load environment variables from .env file
+load_dotenv()
+# Get token from env
+hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+if not hf_token:
+    print("[ERROR] Missing Hugging Face token. Add it to .env as HUGGINGFACE_HUB_TOKEN")
+    sys.exit(1)
+# Login to Hugging Face
+login(token=hf_token)
+# Paths (relative, so they resolve against the current working directory)
+RAW_DATA_DIR = "data/raw"  # input PDFs are listed and read from here
+PROCESSED_DATA_DIR = "data/processed"  # extracted text is written here as .txt files
+VECTOR_DB_DIR = "vector_db"  # persistence directory handed to Chroma
+# Create the output directories up front; RAW_DATA_DIR is not created here
+os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
+os.makedirs(VECTOR_DB_DIR, exist_ok=True)
+# Detect Tesseract path (Windows vs Linux)
+if os.name == "nt":  # Windows
+    default_tess_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+    if not os.path.exists(default_tess_path):
+        print("[ERROR] Tesseract not found. Install from: https://github.com/UB-Mannheim/tesseract/wiki")
+        sys.exit(1)
+    pytesseract.pytesseract.tesseract_cmd = default_tess_path
+else:  # Linux/Mac
+    pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"
+def is_scanned_page(page):
+    """Check if PDF page contains text or is image-based."""
+    text = page.get_text().strip()
+    return len(text) == 0
+def extract_text_from_pdf(pdf_path):
+    """Extract text from PDF with OCR for scanned pages."""
+    doc = fitz.open(pdf_path)
+    all_text = []
+    for page_num, page in enumerate(doc):
+        if is_scanned_page(page):
+            pix = page.get_pixmap(dpi=300)
+            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            text = pytesseract.image_to_string(img)
+            print(f"[OCR] Page {page_num + 1}: {len(text.strip())} chars extracted")
+        else:
+            text = page.get_text()
+            print(f"[TEXT] Page {page_num + 1}: {len(text.strip())} chars extracted")
+        if text.strip():
+            all_text.append(text)
+    return "\n".join(all_text)
+def save_clean_text(filename, text):
+    """Save extracted text to processed folder."""
+    clean_path = os.path.join(PROCESSED_DATA_DIR, filename.replace(".pdf", ".txt"))
+    with open(clean_path, "w", encoding="utf-8") as f:
+        f.write(text)
+    return clean_path
+def chunk_text(file_path):
+    """Split text into overlapping chunks."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        text = f.read()
+    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
+    chunks = splitter.split_text(text)
+    print(f"[CHUNKS] {file_path}: {len(chunks)} chunks created")
+    docs = [Document(page_content=chunk, metadata={"source": file_path}) for chunk in chunks]
+    return docs
+def embed_and_store(documents):
+    """Generate embeddings and store in Chroma vector DB."""
+    if not documents:
+        print("[ERROR] No documents to embed. Exiting.")
+        sys.exit(1)
+    embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
+    # Test embedding
+    test_vec = embedding_model.embed_query("Hello world")
+    if not test_vec or all(v == 0 for v in test_vec):
+        print("[ERROR] Embedding model returned empty vectors. Check Hugging Face token or model access.")
+        sys.exit(1)
+    vectordb = Chroma.from_documents(documents, embedding_model, persist_directory=VECTOR_DB_DIR)
+    vectordb.persist()
+    print(f"[OK] Stored {len(documents)} chunks in vector DB at {VECTOR_DB_DIR}")
+def main():
+    all_docs = []
+    for filename in os.listdir(RAW_DATA_DIR):
+        if filename.endswith(".pdf"):
+            pdf_path = os.path.join(RAW_DATA_DIR, filename)
+            print(f"[LOAD] Processing {filename}...")
+            text = extract_text_from_pdf(pdf_path)
+            if not text.strip():
+                print(f"[WARNING] No text extracted from {filename}, skipping...")
+                continue
+            clean_path = save_clean_text(filename, text)
+            docs = chunk_text(clean_path)
+            all_docs.extend(docs)
+    embed_and_store(all_docs)
+    print("[DONE] All documents processed and stored.")
+if __name__ == "__main__":
+    main()

src/retriever.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+VECTOR_DB_DIR = "vector_db"
+def get_retriever():
+    embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
+    vectordb = Chroma(persist_directory=VECTOR_DB_DIR, embedding_function=embedding_model)
+    return vectordb.as_retriever(search_kwargs={"k": 3})