# Lexicon_Chatbot / app.py
# (Hugging Face Space page header: commit 15346ff, verified, by Harishkhawaja)
import os
import gradio as gr
import fitz # PyMuPDF
from typing import List
from groq import Groq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
# === Groq Client Setup ===
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# === Custom LLM Wrapper ===
class GroqLLM(LLM):
    """LangChain-compatible LLM that delegates to the Groq chat-completions API."""

    # Default Groq model; override via the constructor's `model` argument.
    model: str = "llama3-70b-8192"

    def __init__(self, model: str = None):
        super().__init__()
        if model:
            self.model = model

    def _call(self, prompt: str, stop: List[str] = None) -> str:
        """Send `prompt` as a single user message and return the reply text.

        `stop` sequences supplied by LangChain are forwarded to the API
        (the original silently dropped them). API failures are returned as a
        tagged string instead of raised, so the chain degrades gracefully
        and the error surfaces in the UI output box.
        """
        try:
            response = client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                stop=stop,  # None is accepted and means "no stop sequences"
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"[Groq API Error] {str(e)}"

    @property
    def _llm_type(self) -> str:
        return "groq_llm"
# === Input Extraction ===
def extract_text(file=None, clipboard=None):
    """Return raw text from an uploaded PDF or pasted clipboard text.

    The PDF takes precedence when both inputs are provided. Returns "" when
    neither is given, and a tagged "[Extract Error] ..." string on failure so
    the caller can surface the problem instead of crashing.
    """
    try:
        if file:
            # Context manager guarantees the PDF handle is closed even if a
            # page fails to render (the original leaked the open document).
            with fitz.open(file.name) as doc:
                return " ".join(page.get_text() for page in doc)
        elif clipboard:
            return clipboard
    except Exception as e:
        return f"[Extract Error] {str(e)}"
    return ""
# === Preprocessing & Vector Store Setup ===
_embeddings_cache = None  # shared sentence-transformer, loaded once per process


def _get_embeddings():
    """Lazily build and cache the embedding model.

    Loading the sentence-transformer weights is slow; the original rebuilt it
    on every request. The model is stateless, so sharing one instance is safe.
    """
    global _embeddings_cache
    if _embeddings_cache is None:
        _embeddings_cache = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return _embeddings_cache


def process_text(input_text):
    """Chunk `input_text`, index the chunks in FAISS, and return a
    RetrievalQA chain (with source documents) backed by the Groq LLM."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(input_text)
    docs = [Document(page_content=chunk) for chunk in chunks]
    db = FAISS.from_documents(docs, _get_embeddings())
    return RetrievalQA.from_chain_type(
        llm=GroqLLM(),
        retriever=db.as_retriever(),
        return_source_documents=True,
    )
# === Core RAG Handler ===
def handle_input(file, clipboard, query):
    """Run the full RAG pipeline: extract text, build the index, answer.

    Returns a user-facing string: the LLM's answer plus up to three source
    snippets, or an error message prefixed with "❌".
    """
    try:
        raw_text = extract_text(file, clipboard)
        if not raw_text:
            return "Please provide either a PDF or clipboard text."
        # Surface extraction failures directly; the original would have
        # indexed the error message as if it were document text.
        if raw_text.startswith("[Extract Error]"):
            return f"❌ {raw_text}"
        # Default prompt: plain-language explanation plus a bulleted risk list.
        default_query = (
            "Explain this policy in simple terms and highlight the risks for the user. "
            "Provide bullet points for risks."
        )
        # Treat a whitespace-only question as "no question asked".
        user_query = query.strip() if query and query.strip() else default_query
        qa = process_text(raw_text)
        response = qa.invoke({"query": user_query})
        result = response["result"]
        sources = response["source_documents"]
        source_preview = ""
        if sources:  # avoid printing an empty "Sources" header
            source_preview = "\n\n📄 Sources:\n" + "\n---\n".join(
                doc.page_content[:300] + "..." for doc in sources[:3]
            )
        return result + source_preview
    except Exception as e:
        return f"❌ Error: {str(e)}"
# === Gradio UI ===
# Layout: two side-by-side inputs (PDF upload / pasted text), an optional
# question box, an Analyze button, and a large read-only output box.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Lexicon: Your Policy Explainer Bot")

    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        pasted_text = gr.Textbox(
            label="Or Paste Text",
            placeholder="Paste policy text here",
            lines=10,
        )

    question_box = gr.Textbox(
        label="Ask a Question (optional)",
        placeholder="e.g., What risks am I agreeing to?",
    )
    analyze_btn = gr.Button("🔍 Analyze")
    answer_box = gr.Textbox(label="Output", lines=20)

    # Wire the button to the RAG handler defined above.
    analyze_btn.click(
        fn=handle_input,
        inputs=[pdf_upload, pasted_text, question_box],
        outputs=answer_box,
    )

demo.launch()