import os
from typing import List, Optional

import fitz  # PyMuPDF
import gradio as gr
from groq import Groq
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.llms.base import LLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# === Groq Client Setup ===
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


# === Custom LLM Wrapper ===
class GroqLLM(LLM):
    """Minimal LangChain LLM wrapper around the Groq chat-completions API."""

    # Default Groq model; overridable via the constructor.
    model: str = "llama3-70b-8192"

    def __init__(self, model: Optional[str] = None):
        super().__init__()
        if model:
            self.model = model

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send *prompt* as a single user message and return the reply text.

        NOTE: *stop* is accepted for LangChain compatibility but is not
        forwarded to the API. Failures are returned as an error string rather
        than raised, so the Gradio UI can display them to the user.
        """
        try:
            response = client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"[Groq API Error] {str(e)}"

    @property
    def _llm_type(self) -> str:
        return "groq_llm"


# === Input Extraction ===
def extract_text(file=None, clipboard=None):
    """Return raw text from an uploaded PDF (preferred) or pasted text.

    Returns "" when neither input is provided. On extraction failure an error
    string is returned (callers treat any non-empty string as content).
    """
    try:
        if file:
            # Use a context manager so the PyMuPDF document is always closed
            # (the original code leaked the file handle).
            with fitz.open(file.name) as doc:
                return " ".join(page.get_text() for page in doc)
        elif clipboard:
            return clipboard
    except Exception as e:
        return f"[Extract Error] {str(e)}"
    return ""


# === Preprocessing & Vector Store Setup ===
def process_text(input_text):
    """Chunk *input_text*, embed the chunks into a FAISS index, and return a
    RetrievalQA chain backed by GroqLLM.

    NOTE(review): the index is rebuilt on every call — acceptable for a demo,
    but worth caching per-document if this is used at scale.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = splitter.split_text(input_text)
    docs = [Document(page_content=t) for t in texts]
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)
    retriever = db.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=GroqLLM(),
        retriever=retriever,
        return_source_documents=True,
    )
    return qa_chain


# === Core RAG Handler ===
def handle_input(file, clipboard, query):
    """Gradio callback: extract text, run the RAG chain, format the answer.

    Any exception is converted to a user-facing error string so the UI never
    shows a traceback.
    """
    try:
        raw_text = extract_text(file, clipboard)
        if not raw_text:
            return "Please provide either a PDF or clipboard text."

        # Default prompt: explanation + risks when the user asks nothing.
        default_query = (
            "Explain this policy in simple terms and highlight the risks for the user. "
            "Provide bullet points for risks."
        )
        user_query = query if query else default_query

        qa = process_text(raw_text)
        response = qa.invoke({"query": user_query})
        result = response["result"]
        sources = response["source_documents"]

        # Show the first 300 chars of up to three retrieved chunks.
        source_preview = "\n\nšŸ“„ Sources:\n" + "\n---\n".join(
            [doc.page_content[:300] + "..." for doc in sources[:3]]
        )
        return result + source_preview
    except Exception as e:
        return f"āŒ Error: {str(e)}"


# === Gradio UI ===
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# šŸ¤– Lexicon: Your Policy Explainer Bot")
    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        clipboard_input = gr.Textbox(label="Or Paste Text", placeholder="Paste policy text here", lines=10)
    query_input = gr.Textbox(label="Ask a Question (optional)", placeholder="e.g., What risks am I agreeing to?")
    submit_btn = gr.Button("šŸ” Analyze")
    output = gr.Textbox(label="Output", lines=20)

    submit_btn.click(fn=handle_input, inputs=[file_input, clipboard_input, query_input], outputs=output)

# Guard the launch so importing this module does not start a web server.
if __name__ == "__main__":
    demo.launch()