"""Lexicon: a Streamlit app that explains, summarizes, and highlights risks
in large policy or terms-and-conditions documents.

Pipeline: extract text (uploaded PDF or pasted text) -> split into sentence
chunks -> embed and index with FAISS -> answer questions through a
RetrievalQA chain backed by a Groq-hosted LLM.
"""
import streamlit as st
import fitz  # PyMuPDF
import os
# NOTE(review): sentence_transformers and faiss are no longer called directly
# (the redundant manual-index path was removed), but they are kept imported —
# HuggingFaceEmbeddings and the LangChain FAISS store depend on them at runtime.
from sentence_transformers import SentenceTransformer
import faiss
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.llms import Groq
from dotenv import load_dotenv

load_dotenv()

# --- Configuration ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # read from .env via load_dotenv()
GROQ_MODEL_NAME = "mixtral-8x7b-32768"

st.set_page_config(page_title="Lexicon - Policy Explainer", layout="centered")

# Page header. The original passed a plain "..." string broken across physical
# lines (a syntax error); a triple-quoted string preserves the same rendered text.
st.markdown(
    """

📜 Lexicon: Policy Explainer Bot

""",
    unsafe_allow_html=True,
)
st.markdown("This app explains, summarizes, and highlights risks in large policy or T&C documents.")

uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
clipboard_input = st.text_area("Or paste policy text here", height=200)


def extract_text_from_pdf(file):
    """Return the concatenated text of every page in an uploaded PDF.

    `file` is a file-like object (Streamlit UploadedFile); the document is
    opened from its bytes and closed deterministically via the context manager.
    """
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return " ".join(page.get_text() for page in doc)


if uploaded_file or clipboard_input:
    with st.spinner("Processing document..."):
        # Step 1: extract raw text — uploaded PDF wins over pasted text.
        raw_text = extract_text_from_pdf(uploaded_file) if uploaded_file else clipboard_input

        # Step 2: naive sentence split. NOTE(review): splitting on ". " misses
        # abbreviations and newlines; a proper text splitter would be more
        # robust, but this preserves the original chunking strategy. Empty /
        # whitespace-only fragments are dropped so the embedder never sees them.
        sentences = [s for s in raw_text.split(". ") if s.strip()]

        # Step 3: build the vector store. The original also built a raw
        # faiss.IndexFlatL2, embedded every sentence with SentenceTransformer,
        # and called FAISS(embedding_function=..., index=...) — a call missing
        # the required docstore arguments (TypeError) whose result was then
        # discarded and rebuilt by from_documents. That dead, duplicated work
        # is removed: from_documents embeds and indexes in one pass.
        documents = [Document(page_content=s) for s in sentences]
        embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
        retriever = FAISS.from_documents(documents, embedding_model).as_retriever()

        # Step 4: retrieval-augmented QA chain over the indexed document.
        llm = Groq(api_key=GROQ_API_KEY, model=GROQ_MODEL_NAME)
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    st.success("Document processed! You can now ask questions.")

    query = st.text_input("Ask a question about the document:")
    if query:
        with st.spinner("Generating answer..."):
            result = qa_chain.run(query)
        st.markdown("### 🧠 Answer:")
        st.markdown(result)

    if st.button("Suggest key risks"):
        risk_prompt = "List any risks or obligations a user should be aware of from this document."
        with st.spinner("Identifying risks..."):
            risk_result = qa_chain.run(risk_prompt)
        st.markdown("### ⚠️ Risks & Concerns:")
        st.markdown(risk_result)