# teamb/pipeline.py
# Uploaded by Rakib023 — "Create pipeline.py" (commit 0bad9bf, verified)
import os, re
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from config import GEMINI_MODEL, EMBED_MODEL
# Initialize models
# Shared, module-level model instances; model names come from config.py.
# NOTE(review): the Google clients presumably read GOOGLE_API_KEY from the
# environment at import time — confirm it is set before this module loads.
embedding_model = GoogleGenerativeAIEmbeddings(model=EMBED_MODEL)
# temperature=0 → deterministic answers, appropriate for legal Q&A.
llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0)
# Load and process PDFs
def process_pdf(file_path):
    """Load a PDF and stamp every page with its originating filename.

    Returns the list of per-page Documents produced by PyPDFLoader, with
    each page's "source" metadata normalised to the bare filename (no
    directory components), so downstream metadata enrichment can match on it.
    """
    pages = PyPDFLoader(file_path).load()
    filename = os.path.basename(file_path)
    for page in pages:
        page.metadata["source"] = filename
    return pages
def enrich_metadata(docs):
    """Tag each document with the law it belongs to, inferred from its filename.

    Keywords are checked in order against the lowercased "source" metadata;
    the first match wins. Documents with no matching keyword are left
    untouched. Mutates the documents in place and returns the same list.
    """
    law_tags = (
        ("ict", {"law_name": "ICT Act", "year": 2006}),
        ("labour", {"law_name": "Labour Act", "year": 2018}),
        ("penal", {"law_name": "Penal Code", "year": 1860}),
        ("constitution", {"law_name": "Constitution", "year": 1972}),
    )
    for doc in docs:
        source_name = doc.metadata.get("source", "").lower()
        for keyword, tags in law_tags:
            if keyword in source_name:
                doc.metadata.update(tags)
                break
    return docs
def semantic_split(docs):
    """Split documents into chunks at legal-structure headings.

    Splits each document's text on "Section N", "Article N" or "Chapter N"
    headings (case-insensitive) and emits one Document per heading, whose
    page_content is the heading followed by its body text and whose metadata
    carries the original metadata plus a "section_heading" key.

    Fix over the original: a page whose text contains NO recognizable
    heading used to be dropped entirely (re.split returned a single element
    and the loop never ran). Such pages are now kept as a single whole-page
    chunk so no content is lost. Text before the first heading on a page is
    still discarded, as in the original.
    """
    section_chunks = []
    # Capturing group makes re.split keep the headings in the result:
    # splits alternates [preamble, heading1, body1, heading2, body2, ...]
    pattern = re.compile(
        r"(Section\s\d+\.?\d*|Article\s\d+\.?\d*|Chapter\s\d+\.?\d*)",
        re.IGNORECASE,
    )
    for doc in docs:
        text = doc.page_content or ""
        splits = pattern.split(text)
        if len(splits) < 2:
            # No heading matched: keep the page as one chunk instead of
            # silently losing it (skip pages that are only whitespace).
            if text.strip():
                section_chunks.append(
                    Document(page_content=text, metadata=doc.metadata.copy())
                )
            continue
        # Odd indices are headings, the following even index is the body.
        for i in range(1, len(splits), 2):
            heading = splits[i].strip()
            body = splits[i + 1].strip() if i + 1 < len(splits) else ""
            meta = doc.metadata.copy()
            meta["section_heading"] = heading
            section_chunks.append(
                Document(page_content=f"{heading}\n{body}", metadata=meta)
            )
    return section_chunks
# Build vector DB
def build_vector_db(documents, persist_dir="chroma_db_laws"):
vectorstore = Chroma.from_documents(
documents=documents,
embedding=embedding_model,
persist_directory=persist_dir
)
vectorstore.persist()
return vectorstore
# Load existing vector DB
def load_vector_db(persist_dir="chroma_db_laws"):
    """Reopen a Chroma store previously persisted under *persist_dir*."""
    return Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_model,
    )
# Create retriever
def get_qa_chain(vectorstore):
    """Build a RetrievalQA chain answering from the top-5 similar chunks.

    The "stuff" chain type concatenates all retrieved chunks into a single
    prompt for the module-level Gemini LLM; source documents are returned
    alongside each answer for citation.
    """
    top_k_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=top_k_retriever,
        return_source_documents=True,
    )