| import os, re |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain.schema import Document |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI |
| from langchain.vectorstores import Chroma |
| from langchain.chains import RetrievalQA |
| from config import GEMINI_MODEL, EMBED_MODEL |
|
|
| |
# Shared Gemini clients, created once at import time and reused by every
# function below.  temperature=0 keeps the LLM's answers deterministic,
# which is what we want for legal Q&A.
# NOTE(review): both clients presumably read GOOGLE_API_KEY from the
# environment — confirm deployment config before shipping.
embedding_model = GoogleGenerativeAIEmbeddings(model=EMBED_MODEL)
llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0)
|
|
| |
def process_pdf(file_path):
    """Load a PDF and tag every page with its source filename.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        A list of per-page documents, each with metadata["source"] set to
        the file's base name (so answers can cite which law they came from).
    """
    filename = os.path.basename(file_path)
    documents = PyPDFLoader(file_path).load()
    for page in documents:
        page.metadata["source"] = filename
    return documents
|
|
def enrich_metadata(docs):
    """Attach law_name/year metadata based on each doc's source filename.

    Matching is case-insensitive substring search, first match wins.
    Documents whose source matches no known keyword are left untouched.

    Args:
        docs: Documents whose metadata may contain a "source" filename.

    Returns:
        The same list, with matching docs' metadata updated in place.
    """
    law_tags = (
        ("ict", {"law_name": "ICT Act", "year": 2006}),
        ("labour", {"law_name": "Labour Act", "year": 2018}),
        ("penal", {"law_name": "Penal Code", "year": 1860}),
        ("constitution", {"law_name": "Constitution", "year": 1972}),
    )
    for doc in docs:
        source = doc.metadata.get("source", "").lower()
        for keyword, tags in law_tags:
            if keyword in source:
                doc.metadata.update(tags)
                break
    return docs
|
|
def semantic_split(docs):
    """Split documents at Section/Article/Chapter headings.

    Produces one Document per detected heading.  The heading text is kept
    at the top of the chunk and also stored in metadata["section_heading"]
    so retrieval results can be cited by section.  Any text before the
    first heading is discarded.

    Args:
        docs: Page-level documents with page_content and metadata.

    Returns:
        A list of heading-scoped Document chunks.
    """
    heading_re = re.compile(
        r"(Section\s\d+\.?\d*|Article\s\d+\.?\d*|Chapter\s\d+\.?\d*)",
        re.IGNORECASE,
    )
    chunks = []
    for doc in docs:
        # re.split with one capture group yields
        # [preamble, heading1, body1, heading2, body2, ...]
        parts = heading_re.split(doc.page_content or "")
        for idx, raw_heading in enumerate(parts[1::2]):
            heading = raw_heading.strip()
            body_pos = 2 * idx + 2
            body = parts[body_pos].strip() if body_pos < len(parts) else ""
            chunk_meta = doc.metadata.copy()
            chunk_meta["section_heading"] = heading
            chunks.append(
                Document(page_content=f"{heading}\n{body}", metadata=chunk_meta)
            )
    return chunks
|
|
| |
def build_vector_db(documents, persist_dir="chroma_db_laws"):
    """Embed documents into a Chroma collection persisted on disk.

    Args:
        documents: Chunked documents to index (embedded with the
            module-level Gemini embedding model).
        persist_dir: Directory the Chroma collection is written to.

    Returns:
        The populated Chroma vector store.
    """
    store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_dir,
    )
    # Flush the collection to disk so load_vector_db can reopen it later.
    store.persist()
    return store
|
|
| |
def load_vector_db(persist_dir="chroma_db_laws"):
    """Reopen a previously persisted Chroma collection.

    Args:
        persist_dir: Directory where build_vector_db persisted the store.

    Returns:
        A Chroma vector store backed by the on-disk collection, using the
        same module-level embedding model it was built with.
    """
    store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_model,
    )
    return store
|
|
| |
def get_qa_chain(vectorstore):
    """Build a RetrievalQA chain over the given vector store.

    Retrieves the 5 most similar chunks per query and stuffs them into a
    single prompt for the module-level Gemini LLM.

    Args:
        vectorstore: A Chroma (or compatible) vector store.

    Returns:
        A RetrievalQA chain that also returns its source documents, so
        callers can show which law/section each answer came from.
    """
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5},
        ),
        return_source_documents=True,
    )
|
|