Rakib023 commited on
Commit
45fb094
·
verified ·
1 Parent(s): e15b2d2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
# Streamlit front-end for the Bangladesh Law RAG QA system.
# Delegates all retrieval/LLM work to rag_pipeline (imported below).
import streamlit as st
from rag_pipeline import load_and_process_documents, ask_question

# Page chrome; st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Bangladesh Law QA", layout="wide")
st.title("πŸ“š Bangladesh Law RAG QA System")
st.markdown("Ask legal questions based on the Constitution, ICT Act, Labour Law, and more.")
# One-time document ingestion; cached for the lifetime of the server process
# so the PDFs are parsed and embedded only once.
@st.cache_resource
def setup():
    """Load every bundled law PDF and build the retrieval pipeline.

    Returns:
        The (chunks, retriever, qa_chain) triple produced by
        rag_pipeline.load_and_process_documents.
    """
    pdf_dir = "./pdfs"
    filenames = (
        "Bangladesh-ICT-Act-2006.pdf",
        "Bangladesh-Labour-Act-2006_English-Upto-2018.pdf",
        "bangladesh_rti_act_2009_summary.pdf",
        "bgd-gbv-19-03-law-1860-eng-the-penal-code-1860.pdf",
        "constitution.pdf",
        "gazette.pdf",
        "unicef.pdf",
    )
    return load_and_process_documents([f"{pdf_dir}/{name}" for name in filenames])
# Build (or fetch the cached) pipeline once per script run.
chunks, retriever, qa_chain = setup()

query = st.text_input("πŸ” Enter your legal question")
law_options = ["All", "ICT Act", "Labour Act", "Penal Code", "Constitution"]
law_filter = st.selectbox("πŸ“˜ Filter by Law (optional)", law_options)
# "All" means no filtering; the pipeline expects None in that case.
if law_filter == "All":
    law_filter = None

if query:
    with st.spinner("Answering..."):
        answer, sources = ask_question(query, retriever, qa_chain, law_filter)
        st.success(answer)
        with st.expander("πŸ“Ž Source Documents"):
            for src_doc in sources:
                law_name = src_doc.metadata.get('law_name', '')
                heading = src_doc.metadata.get('section_heading', '')
                st.markdown(f"**{law_name} - {heading}**")
                # Show only a preview of each chunk to keep the page compact.
                st.text(src_doc.page_content[:500])
# BONUS: canned questions so users can try the system with one click.
st.markdown("---")
st.markdown("### πŸ§ͺ Try Sample Legal Questions:")
sample_questions = [
    ("What does the Constitution say about freedom of expression?", "Constitution"),
    ("Under ICT Act, is cyberbullying a crime?", "ICT Act"),
    ("How many hours can a laborer work in a day?", "Labour Act"),
    ("What are the punishments under the Digital Security Act for hacking?", "ICT Act"),
    ("Is digital evidence allowed in court?", "ICT Act"),
]

for sample_q, sample_law in sample_questions:
    # Each sample gets its own button; nothing runs until one is pressed.
    if not st.button(f"▢️ {sample_q}"):
        continue
    with st.spinner("Running..."):
        answer, sources = ask_question(sample_q, retriever, qa_chain, law_filter=sample_law)
        st.success(answer)
        with st.expander("πŸ“Ž Source Documents"):
            for src_doc in sources:
                law_name = src_doc.metadata.get('law_name', '')
                heading = src_doc.metadata.get('section_heading', '')
                st.markdown(f"**{law_name} - {heading}**")
                st.text(src_doc.page_content[:500])
# rag_pipeline.py
# Ingestion + retrieval pipeline for the law PDFs. (Embedded in this dump
# after app.py; app.py imports it as its own module.)
import os, re
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
# NOTE(review): `langchain.vectorstores` is deprecated in recent LangChain
# releases in favour of `langchain_community.vectorstores` (or the
# `langchain_chroma` package) — confirm against the pinned langchain version.
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
def load_and_process_documents(pdf_paths):
    """Load law PDFs, tag them with law metadata, split them into
    section-level chunks, and build the vector store + QA chain.

    Args:
        pdf_paths: iterable of filesystem paths to PDF files.

    Returns:
        (section_chunks, retriever, qa_chain) where section_chunks is the
        list of Document chunks that were indexed in the vector store.
    """
    all_docs = []
    for path in pdf_paths:
        pages = PyPDFLoader(path).load()
        # Keep only the filename so the metadata tagging below can key on it.
        for page in pages:
            page.metadata["source"] = os.path.basename(path)
        all_docs.extend(pages)

    # Tag each page with law-level metadata inferred from its filename.
    for doc in all_docs:
        src = doc.metadata.get("source", "").lower()
        if "ict" in src:
            doc.metadata.update({"law_name": "ICT Act", "year": 2006, "law_type": "ICT"})
        elif "labour" in src:
            # NOTE(review): the Labour Act dates from 2006 (amended up to
            # 2018 per the filename) — confirm 2018 is the intended filter year.
            doc.metadata.update({"law_name": "Labour Act", "year": 2018, "law_type": "Labour"})
        elif "penal" in src:
            doc.metadata.update({"law_name": "Penal Code", "year": 1860, "law_type": "Criminal"})
        elif "constitution" in src:
            doc.metadata.update({"law_name": "Constitution", "year": 1972, "law_type": "Constitutional"})

    # Split pages on legal section headings.
    # BUG FIX: the original pattern was written r"(Section\\s\\d+...)" —
    # doubled backslashes inside a RAW string match a literal backslash
    # character, so the pattern could never match real PDF text and every
    # page was silently dropped (leaving Chroma with zero documents).
    section_pattern = re.compile(
        r"(Section\s+\d+\.?\d*|Article\s+\d+\.?\d*|Chapter\s+\d+\.?\d*)",
        re.IGNORECASE,
    )
    section_chunks = []
    for doc in all_docs:
        text = doc.page_content or ""
        splits = section_pattern.split(text)
        if len(splits) == 1:
            # No heading found on this page: keep the whole page as one
            # chunk instead of silently dropping its content.
            if text.strip():
                section_chunks.append(Document(page_content=text, metadata=doc.metadata.copy()))
            continue
        # re.split with a capturing group yields [pre, heading, body, heading, body, ...]
        for i in range(1, len(splits), 2):
            heading = splits[i].strip()
            body = splits[i + 1].strip() if i + 1 < len(splits) else ""
            meta = doc.metadata.copy()
            meta.update({"section_heading": heading})
            section_chunks.append(Document(page_content=f"{heading}\n{body}", metadata=meta))

    # Embed and index the chunks; persisted to disk so restarts can reuse it.
    embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vectorstore = Chroma.from_documents(section_chunks, embedding=embedding, persist_directory="./chroma_db")
    # NOTE(review): persist() is a no-op / removed in newer langchain-chroma
    # (persistence is automatic with persist_directory) — confirm version.
    vectorstore.persist()

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True, chain_type="stuff")

    return section_chunks, retriever, qa_chain
def ask_question(query, retriever, qa_chain, law_filter=None, year_filter=None):
    """Answer `query` from retrieved law chunks, optionally filtered by law/year.

    Args:
        query: the user's natural-language legal question.
        retriever: vector-store retriever from load_and_process_documents.
        qa_chain: RetrievalQA chain; its inner combine-documents ("stuff")
            chain is invoked directly so the metadata filters take effect.
        law_filter: if set, keep only docs whose metadata "law_name" equals it.
        year_filter: if set, keep only docs whose metadata "year" equals it.

    Returns:
        (answer_text, source_documents); ("No relevant information found.", [])
        when every retrieved document is filtered out.
    """
    docs = retriever.get_relevant_documents(query)
    if law_filter:
        docs = [d for d in docs if d.metadata.get("law_name") == law_filter]
    if year_filter:
        docs = [d for d in docs if d.metadata.get("year") == year_filter]

    if not docs:
        return "No relevant information found.", []

    # BUG FIX: the original called qa_chain({"input_documents": docs, ...}),
    # but RetrievalQA ignores "input_documents" and re-runs its own retriever,
    # so the filters above had no effect on the answer or the returned sources.
    # Run the chain's inner stuff chain over the filtered docs instead, and
    # return those same docs as the sources actually used.
    result = qa_chain.combine_documents_chain(
        {"input_documents": docs, "question": query}
    )
    return result["output_text"], docs