menikev committed on
Commit
84fb4ed
·
verified ·
1 Parent(s): ce89480

Upload 8 files

Browse files
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/__pycache__/retriever.cpython-310.pyc ADDED
Binary file (676 Bytes). View file
 
src/agent.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from retriever import get_retriever
3
+ from langchain.chains import RetrievalQA
4
+ from transformers import pipeline
5
+ from langchain_community.llms import HuggingFacePipeline
6
+ from langchain_community.llms import HuggingFaceEndpoint
7
+ from dotenv import load_dotenv
8
+
9
# Bring variables from a local .env file into the process environment
load_dotenv()

# Retriever over the persisted vector store
retriever = get_retriever()

# Local Hugging Face text-generation pipeline backing the assistant.
# NOTE(review): `temperature` is passed without `do_sample=True`; transformers
# presumably ignores it under greedy decoding -- confirm the intended mode.
pipe = pipeline(
    task="text-generation",
    model="tiiuae/falcon-7b-instruct",
    device_map="auto",
    trust_remote_code=True,
    temperature=0.2,
    max_new_tokens=512,
)

# Adapter exposing the pipeline through LangChain's LLM interface
llm = HuggingFacePipeline(pipeline=pipe)
27
+
28
# Prompt templates
# Each template carries only the {question} slot, so code that formats it
# directly keeps working. The {context} slot for retrieved passages is spliced
# in by _build_qa_chain when the chain prompt is created.
english_prompt_template = """
You are a helpful Nigerian legal assistant.
Answer clearly in English, keeping the legal facts correct.
After the answer, list the sources you used.

Question: {question}
Answer:
"""

pidgin_prompt_template = """
You be legal assistant wey sabi Nigerian law well well.
The user fit talk for English or Pidgin, but you go always answer for Nigerian Pidgin.
No change the legal facts, but make am simple so person wey no study law fit understand.
After you give the answer, put list of the sources wey you use.

Question: {question}
Answer for Nigerian Pidgin:
"""

# Text markers used to splice the retrieved context into a template.
_QUESTION_SLOT = "Question: {question}"
_CONTEXT_AND_QUESTION_SLOT = "Context:\n{context}\n\nQuestion: {question}"

# Inputs that end the chat session (compared case-insensitively).
_EXIT_COMMANDS = ("exit", "quit")


def _build_qa_chain(instruction_template):
    """Build a RetrievalQA chain that answers with *instruction_template*.

    The template's question line is expanded so the retrieved passages are
    inserted as ``{context}`` right before the question. The chain is then
    invoked with the user's raw question, so the retriever searches on the
    question itself rather than on the instruction text.

    Args:
        instruction_template: Template text containing ``Question: {question}``.

    Returns:
        A RetrievalQA chain that also returns its source documents.
    """
    # Local import: the module's import block does not provide PromptTemplate.
    from langchain.prompts import PromptTemplate

    prompt = PromptTemplate(
        template=instruction_template.replace(
            _QUESTION_SLOT, _CONTEXT_AND_QUESTION_SLOT, 1
        ),
        input_variables=["context", "question"],
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )


# Create QA chains (one per answer language; `qa_chain` stays the English one)
qa_chain = _build_qa_chain(english_prompt_template)
pidgin_qa_chain = _build_qa_chain(pidgin_prompt_template)


def _read_input(prompt):
    """Return one line typed by the user, or None when input ends (EOF / Ctrl-C)."""
    try:
        return input(prompt)
    except (EOFError, KeyboardInterrupt):
        print()
        return None


def chat():
    """Run the interactive console chat.

    Asks once for the answer language, then answers questions until the user
    types ``exit``/``quit`` or closes the input stream. Blank questions are
    ignored, and each source is listed once per answer.
    """
    print("📜 KnowYourRight Bot")
    print("Type 'exit' to stop.\n")

    # Ask language mode
    while True:
        raw_choice = _read_input("Choose mode: [1] English [2] Pidgin: ")
        if raw_choice is None:
            return
        lang_choice = raw_choice.strip()
        if lang_choice.lower() in _EXIT_COMMANDS:
            return
        if lang_choice in ("1", "2"):
            break
        print("❌ Invalid choice. Please type 1 or 2.")

    # Pick the chain (and therefore the prompt) based on mode
    chain = pidgin_qa_chain if lang_choice == "2" else qa_chain

    # Start chat loop
    while True:
        raw_query = _read_input("\nYou: ")
        if raw_query is None:
            break
        query = raw_query.strip()
        if query.lower() in _EXIT_COMMANDS:
            break
        if not query:
            # Nothing to ask; do not spend an LLM call on an empty question.
            continue

        # The raw question drives both retrieval and the prompt's {question}.
        result = chain.invoke({"query": query})

        # Print answer
        print("\nBot:", result["result"])

        # Print sources, de-duplicated while keeping retrieval order
        print("\n📚 Sources:")
        sources = dict.fromkeys(
            doc.metadata.get("source", "Unknown")
            for doc in result["source_documents"]
        )
        for source in sources:
            print("-", source)
        print("\n" + "-" * 50)


if __name__ == "__main__":
    chat()
src/api.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/embeddings.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/evaluation.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/ingest_documents.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Ingestion Pipeline for KnowYourRight Bot
3
+ - Loads PDFs from data/raw (relative to the working directory)
4
+ - Checks if pages are scanned or text-based
5
+ - Runs OCR when needed
6
+ - Splits into chunks for embedding
7
+ - Generates embeddings using open-source models
8
+ - Saves into ChromaDB vector store
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import fitz # PyMuPDF
14
+ import pytesseract
15
+ from PIL import Image
16
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain_community.embeddings import HuggingFaceEmbeddings
18
+ from langchain_community.vectorstores import Chroma
19
+ from langchain.docstore.document import Document
20
+ from dotenv import load_dotenv
21
+ from huggingface_hub import login
22
+
23
# Pull settings from a local .env file into the process environment
load_dotenv()

# The Hugging Face token is mandatory: stop early when it is absent or empty
hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
if hf_token is None or hf_token == "":
    print("[ERROR] Missing Hugging Face token. Add it to .env as HUGGINGFACE_HUB_TOKEN")
    sys.exit(1)

# Authenticate against the Hugging Face Hub
login(token=hf_token)

# Directory layout (all paths are relative to the working directory)
RAW_DATA_DIR = "data/raw"
PROCESSED_DATA_DIR = "data/processed"
VECTOR_DB_DIR = "vector_db"

# Make sure the output locations exist before any file is written
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
os.makedirs(VECTOR_DB_DIR, exist_ok=True)
42
+
43
def _resolve_tesseract_cmd():
    """Return the path of the Tesseract executable, or None if it is not found.

    The executable is looked up on PATH first, so installs outside the
    historical default locations are picked up. If that fails, the previous
    defaults apply: the standard install path on Windows (None when it does
    not exist) and ``/usr/bin/tesseract`` elsewhere.
    """
    # Local import: shutil is not part of the module's import block.
    import shutil

    found = shutil.which("tesseract")
    if found:
        return found
    if os.name == "nt":  # Windows
        default_tess_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
        return default_tess_path if os.path.exists(default_tess_path) else None
    return "/usr/bin/tesseract"  # Linux/Mac fallback


# Detect Tesseract path (PATH lookup first, then platform defaults)
_tesseract_cmd = _resolve_tesseract_cmd()
if _tesseract_cmd is None:
    print("[ERROR] Tesseract not found. Install from: https://github.com/UB-Mannheim/tesseract/wiki")
    sys.exit(1)
pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd
52
+
53
def is_scanned_page(page):
    """Return True when *page* yields no text apart from whitespace.

    Such a page is treated as image-based (scanned); any other page is
    treated as text-based.
    """
    return not page.get_text().strip()
57
+
58
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF, using OCR for scanned pages.

    Pages with no extractable text are rendered at 300 dpi and passed through
    Tesseract; all other pages use their embedded text. Pages that produce
    only whitespace are left out of the result.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined with newlines ("" when nothing was extracted).
    """
    all_text = []
    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            if is_scanned_page(page):
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                text = pytesseract.image_to_string(img)
                label = "OCR"
            else:
                text = page.get_text()
                label = "TEXT"

            stripped = text.strip()
            print(f"[{label}] Page {page_num}: {len(stripped)} chars extracted")
            if stripped:
                all_text.append(text)
    return "\n".join(all_text)
74
+
75
def save_clean_text(filename, text):
    """Write extracted text to the processed folder and return the new path.

    The output name is *filename* with its final extension replaced by
    ``.txt``. Only the trailing extension is swapped, so names that contain
    ".pdf" elsewhere, or that use an upper-case extension, are handled
    correctly.

    Args:
        filename: Name of the source file (e.g. "constitution.pdf").
        text: Text to store, written as UTF-8.

    Returns:
        Path of the written ``.txt`` file inside PROCESSED_DATA_DIR.
    """
    stem, _extension = os.path.splitext(filename)
    clean_path = os.path.join(PROCESSED_DATA_DIR, stem + ".txt")
    with open(clean_path, "w", encoding="utf-8") as f:
        f.write(text)
    return clean_path
81
+
82
def chunk_text(file_path):
    """Read a UTF-8 text file and return it as overlapping Document chunks.

    The text is split with a chunk size of 800 and an overlap of 100; every
    resulting Document records *file_path* under the "source" metadata key.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    pieces = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=100
    ).split_text(raw_text)
    print(f"[CHUNKS] {file_path}: {len(pieces)} chunks created")

    documents = []
    for piece in pieces:
        documents.append(Document(page_content=piece, metadata={"source": file_path}))
    return documents
91
+
92
def embed_and_store(documents):
    """Embed *documents* and write them to the Chroma store in VECTOR_DB_DIR.

    The process exits with status 1 when there is nothing to embed, or when a
    probe embedding comes back empty or all zeros.
    """

    def _abort(message):
        # Report the problem and terminate with a failure status.
        print(message)
        sys.exit(1)

    if not documents:
        _abort("[ERROR] No documents to embed. Exiting.")

    embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

    # Probe the model once before embedding the whole corpus
    probe = embedding_model.embed_query("Hello world")
    if not probe or not any(value != 0 for value in probe):
        _abort("[ERROR] Embedding model returned empty vectors. Check Hugging Face token or model access.")

    store = Chroma.from_documents(
        documents,
        embedding_model,
        persist_directory=VECTOR_DB_DIR,
    )
    store.persist()
    print(f"[OK] Stored {len(documents)} chunks in vector DB at {VECTOR_DB_DIR}")
109
+
110
def main():
    """Ingest every ``.pdf`` file in RAW_DATA_DIR into the vector store.

    Each PDF is converted to text, saved to the processed folder and chunked,
    and all chunks are then embedded and stored in one pass. Files are handled
    in sorted name order so runs are reproducible. The process exits with
    status 1 when the input directory does not exist.
    """
    if not os.path.isdir(RAW_DATA_DIR):
        print(f"[ERROR] Input directory not found: {RAW_DATA_DIR}")
        sys.exit(1)

    all_docs = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(RAW_DATA_DIR)):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = os.path.join(RAW_DATA_DIR, filename)
        print(f"[LOAD] Processing {filename}...")
        text = extract_text_from_pdf(pdf_path)

        if not text.strip():
            print(f"[WARNING] No text extracted from {filename}, skipping...")
            continue

        clean_path = save_clean_text(filename, text)
        all_docs.extend(chunk_text(clean_path))

    embed_and_store(all_docs)
    print("[DONE] All documents processed and stored.")


if __name__ == "__main__":
    main()
src/retriever.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import Chroma
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+
4
+ VECTOR_DB_DIR = "vector_db"
5
+
6
def get_retriever(k=3):
    """Return a retriever over the Chroma store persisted in VECTOR_DB_DIR.

    Args:
        k: Number of chunks to fetch per query. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        The retriever produced by the vector store's ``as_retriever``.

    Raises:
        ValueError: If *k* is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
    vectordb = Chroma(persist_directory=VECTOR_DB_DIR, embedding_function=embedding_model)
    return vectordb.as_retriever(search_kwargs={"k": k})
10
+