"""Streamlit app that analyzes government documents (PDFs and images) with Groq LLMs.

Pipeline: documents are either OCR-analyzed via a Llama vision model
(process_image) or parsed PDFs are summarized in two Groq passes
(generate_pdf_analysis); results are cleaned and rendered as HTML
(clean_llm_output / format_analysis_results).
"""

import streamlit as st
from groq import Groq
import io
import base64
import re
import os
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, Settings, Document
from llama_index.readers.file import PDFReader
from llama_index.llms.groq import Groq as LlamaGroq
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_community.embeddings import HuggingFaceEmbeddings
from datetime import datetime
from PIL import Image
import gettext

# Load environment variables and configure the Groq client.
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=groq_api_key)

# Configure LlamaIndex: Groq LLM for generation, HuggingFace model for embeddings.
Settings.llm = LlamaGroq(api_key=groq_api_key, model="llama-3.1-70b-versatile")
lc_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
Settings.embed_model = LangchainEmbedding(lc_embed_model)


def initialize_session_state():
    """Initialize all Streamlit session-state variables used by the app.

    Idempotent: only missing keys are created, so reruns keep existing state.
    """
    defaults = {
        'chat_engines': {},
        'analyses': {},
        'documents': {},
        'current_doc': None,
        'messages': [],
        'document_history': {},
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def encode_image_to_base64(image):
    """Convert a PIL Image to a base64-encoded JPEG string.

    Args:
        image: PIL Image (must be JPEG-encodable, i.e. RGB-compatible).

    Returns:
        ASCII base64 string of the JPEG bytes.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()


def process_image(image):
    """Analyze a government-document image with the Llama vision model.

    The image is embedded in the request as a base64 data URL.

    Args:
        image: PIL Image of the document page.

    Returns:
        The model's plain-text analysis.
    """
    img_base64 = encode_image_to_base64(image)
    img_url = f"data:image/jpeg;base64,{img_base64}"

    completion = client.chat.completions.create(
        model="llama-3.2-11b-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Please analyze this government document and provide: 1. Document type and purpose 2. 
Key requirements and deadlines 3. Complex terms explained simply 4. Required actions or next steps 5. Important contact information or submission details""",
                    },
                    {"type": "image_url", "image_url": {"url": img_url}},
                ],
            }
        ],
        temperature=0.1,
        max_tokens=1024,
        top_p=1,
        stream=False,
    )
    return completion.choices[0].message.content


def generate_pdf_analysis(documents):
    """Generate a summarized analysis of parsed PDF documents using Groq.

    Two passes: a detailed structured analysis with the 70B model, then a
    condensing summary pass with the faster 8B model.

    Args:
        documents: Iterable of llama-index Document objects (each with .text).

    Returns:
        Short plain-text summary of the detailed analysis.

    Raises:
        RuntimeError: if either Groq call fails; the original exception is
            chained (callers catching ``Exception`` are unaffected).
    """
    try:
        # Combine all document content into one prompt payload.
        full_text = "\n".join(doc.text for doc in documents)

        # Pass 1: detailed, structured analysis.
        completion = client.chat.completions.create(
            model="llama-3.1-70b-versatile",
            messages=[
                {
                    "role": "user",
                    "content": (
                        "Please analyze this government document and provide:\n"
                        "1. Document Type and Purpose:\n"
                        " - What kind of document is this?\n"
                        " - What is its main purpose?\n\n"
                        "2. Key Requirements:\n"
                        " - What are the main requirements or conditions?\n"
                        " - What documents or information are needed?\n\n"
                        "3. Important Deadlines:\n"
                        " - What are the key dates and deadlines?\n"
                        " - Are there any time-sensitive requirements?\n\n"
                        "4. Complex Terms Explained:\n"
                        " - Explain any technical or legal terms in simple language\n"
                        " - Clarify any complex procedures\n\n"
                        "5. Required Actions:\n"
                        " - What steps need to be taken?\n"
                        " - What is the process to follow?\n\n"
                        "6. Contact Information:\n"
                        " - Who to contact for queries?\n"
                        " - Where to submit the documents?\n\n"
                        "Document content:\n" + full_text
                    ),
                }
            ],
            temperature=0.1,
            max_tokens=2048,
            top_p=1,
        )
        analysis = completion.choices[0].message.content

        # Pass 2: condense the detailed analysis with the faster model.
        completionsum = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {
                    "role": "user",
                    "content": "Summarize the following content: " + analysis,
                }
            ],
            temperature=0.1,
            max_tokens=2048,
            top_p=1,
        )
        return completionsum.choices[0].message.content
    except Exception as e:
        # Chain the original exception so the root cause is not lost.
        raise RuntimeError("Error generating PDF analysis: " + str(e)) from e


def clean_llm_output(output):
    """Clean LLM output by removing HTML tags and markdown emphasis markers.

    Newlines are preserved so that downstream line-based parsing
    (format_analysis_results splits on '\\n') still works; only runs of
    horizontal whitespace are collapsed.

    Args:
        output: Raw model text.

    Returns:
        Cleaned, stripped text with line structure intact.
    """
    # Remove HTML tags.
    cleaned_text = re.sub(r'<[^>]+>', '', output)
    # Remove markdown emphasis markers (** before * so both are stripped).
    cleaned_text = cleaned_text.replace('**', '')
    cleaned_text = cleaned_text.replace('*', '')
    # Collapse runs of spaces/tabs but KEEP newlines.
    # (The previous r'\s+' collapse deleted every newline, which made
    # format_analysis_results' split('\n') a no-op and broke sectioning.)
    cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)
    return cleaned_text.strip()


def format_analysis_results(text):
    """Format analysis results into structured HTML.

    Splits the cleaned text into (title, content) sections on lines shaped
    like 'Title: content', then renders each section as a simple HTML block.

    Args:
        text: Raw analysis text from the LLM.

    Returns:
        HTML string with one block per detected section.
    """
    # First clean the text (tags and emphasis markers removed).
    cleaned_text = clean_llm_output(text)

    # Split into (title, content) sections.
    sections = []
    current_section = ""
    current_title = ""
    for line in cleaned_text.split('\n'):
        line = line.strip()
        if ':' in line and not line.startswith('*'):
            # A 'Title: ...' line starts a new section; flush the previous one.
            if current_title:
                sections.append((current_title, current_section.strip()))
            parts = line.split(':', 1)
            current_title = parts[0].strip()
            current_section = parts[1].strip() if len(parts) > 1 else ""
        else:
            current_section += " " + line
    # Add the last section.
    if current_title:
        sections.append((current_title, current_section.strip()))

    # NOTE(review): the HTML template was truncated in the recovered source
    # (it ended mid-string at `html = "` ... `{content}`). The markup below is
    # a minimal reconstruction -- confirm against the original upstream file.
    rendered = []
    for title, content in sections:
        rendered.append(
            f"""
<div class="analysis-section">
    <h4>{title}</h4>
    <p>{content}</p>
</div>
"""
        )
    return "".join(rendered)