import os
import tempfile

import streamlit as st
import PyPDF2
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from groq import Groq
from gtts import gTTS  # NOTE(review): used further down the file (truncated here) — keep import

# 🚨 Must be the first Streamlit command
st.set_page_config(page_title="🌍 Climate Companion", layout="wide")


@st.cache_resource
def load_model():
    """Return the cached all-MiniLM-L6-v2 sentence embedder (one per process)."""
    return SentenceTransformer("all-MiniLM-L6-v2")


@st.cache_resource
def load_groq_client():
    """Return a cached Groq client; reads GROQ_API_KEY from the environment."""
    return Groq(api_key=os.getenv("GROQ_API_KEY"))


embed_model = load_model()
client = load_groq_client()

# UI header.
# NOTE(review): the original rich HTML markup was lost in extraction; this
# keeps the same visible text in a minimal banner.
st.markdown(
    "<h1>🌍 Climate Companion</h1>"
    "<p>Upload a climate report and ask environment-related questions.</p>",
    unsafe_allow_html=True,
)

# PDF uploader
uploaded_file = st.file_uploader("📄 Upload Climate Report (PDF)", type="pdf")


def chunk_text(text, max_tokens=100, overlap=20):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Raw document text.
        max_tokens: Number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of non-empty chunk strings.
    """
    words = text.split()
    chunks = []
    # BUG FIX: step could be <= 0 when overlap >= max_tokens, making the
    # range() call raise (or loop forever with step 0); clamp to at least 1.
    step = max(1, max_tokens - overlap)
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i + max_tokens])
        if chunk.strip():
            chunks.append(chunk)
    return chunks


# Process the uploaded file only once per session (keyed by filename).
if uploaded_file:
    if ("processed_file" not in st.session_state
            or st.session_state.processed_file != uploaded_file.name):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.read())
            tmp_path = tmp_file.name

        try:
            with open(tmp_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() can return None for image-only pages.
                full_text = "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            st.error(f"❌ Failed to read PDF: {e}")
            st.stop()
        finally:
            # BUG FIX: the delete=False temp file used to leak on every upload.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

        if not full_text.strip():
            st.error("❌ No extractable text found in the PDF.")
            st.stop()

        st.success("✅ Extracted text from PDF successfully.")

        # Chunk + embed + build the FAISS index.
        with st.spinner("🔄 Chunking and embedding text..."):
            chunks = chunk_text(full_text)
            embeddings = embed_model.encode(chunks, show_progress_bar=True)
            dimension = embeddings.shape[1]
            index = faiss.IndexFlatL2(dimension)
            index.add(np.array(embeddings).astype("float32"))

        # Cache everything in session_state so reruns skip re-embedding.
        st.session_state.processed_file = uploaded_file.name
        st.session_state.chunks = chunks
        st.session_state.index = index
        st.session_state.dimension = dimension
        st.success(f"📚 {len(chunks)} text chunks embedded and indexed.")
    else:
        chunks = st.session_state.chunks
        index = st.session_state.index
        dimension = st.session_state.dimension
        st.success("✅ Using cached embeddings from this session.")

# Question and Answer section
st.markdown("---")
st.subheader("🌱 Ask a Climate-Related Question")
col1, col2 = st.columns([5, 1])
question = col1.text_input("Enter your question here")
submit = col2.button("🔍 Get Answer")

if submit and question:
    # BUG FIX: the original referenced `index`/`chunks` unconditionally and
    # raised NameError when no PDF had been processed in this run; pull them
    # from session_state and guard.
    if "index" not in st.session_state:
        st.warning("⚠️ Please upload and process a PDF first.")
        st.stop()
    chunks = st.session_state.chunks
    index = st.session_state.index

    with st.spinner("🧠 Generating response..."):
        q_embed = embed_model.encode([question])
        # BUG FIX: a fixed k=3 on a corpus with fewer than 3 chunks makes
        # FAISS pad results with -1, which silently indexed chunks[-1].
        k = min(3, len(chunks))
        _, indices = index.search(np.array(q_embed).astype("float32"), k)
        top_chunks = [chunks[i] for i in indices[0] if i >= 0]
        context = "\n".join(top_chunks)

        prompt = f"""
You are a climate science expert. Use the context to answer the user's question concisely.

Context:
{context}

Question: {question}
"""
        try:
            response = client.chat.completions.create(
                model="llama3-8b-8192",
                messages=[
                    {"role": "system", "content": "You are a helpful environmental scientist."},
                    {"role": "user", "content": prompt},
                ],
            )
            answer = response.choices[0].message.content.strip()
            st.markdown("### ✅ Answer")
            # NOTE(review): the original rendered the answer inside a styled
            # HTML container whose markup was truncated in extraction; plain
            # markdown preserves the same information. Confirm against the
            # full file before shipping.
            st.markdown(answer)
        except Exception as e:
            st.error(f"❌ Failed to generate answer: {e}")