amanapk commited on
Commit
7c331c3
·
verified ·
1 Parent(s): b03fe49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +219 -147
app.py CHANGED
@@ -1,168 +1,240 @@
 
1
  import streamlit as st
2
- import pymupdf
3
- import re
4
- import traceback
5
  import faiss
6
  import numpy as np
7
- import requests
8
- from rank_bm25 import BM25Okapi
9
- from sentence_transformers import SentenceTransformer
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- from langchain_groq import ChatGroq
12
- import torch
13
- import os
14
-
15
- st.set_page_config(page_title="Financial Insights Chatbot", page_icon="📊", layout="wide")
16
-
17
- device = "cuda" if torch.cuda.is_available() else "cpu"
18
-
19
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
20
- ALPHA_VANTAGE_API_KEY = os.getenv("ALPHA_VANTAGE_API_KEY")
21
-
22
- try:
23
- llm = ChatGroq(temperature=0, model="llama3-70b-8192", api_key=GROQ_API_KEY)
24
- st.success("✅ LLM initialized successfully. Using llama3-70b-8192")
25
- except Exception as e:
26
- st.error("❌ Failed to initialize Groq LLM.")
27
- traceback.print_exc()
28
-
29
- embedding_model = SentenceTransformer("baconnier/Finance2_embedding_small_en-V1.5", device=device)
30
-
31
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
32
 
33
- def fetch_financial_data(company_ticker):
34
- if not company_ticker:
35
- return "No ticker symbol provided. Please enter a valid company ticker."
36
 
37
- try:
38
- overview_url = f"https://www.alphavantage.co/query?function=OVERVIEW&symbol={company_ticker}&apikey={ALPHA_VANTAGE_API_KEY}"
39
- overview_response = requests.get(overview_url)
40
-
41
- if overview_response.status_code == 200:
42
- overview_data = overview_response.json()
43
- market_cap = overview_data.get("MarketCapitalization", "N/A")
44
- else:
45
- return "Error fetching company overview."
46
 
47
- income_url = f"https://www.alphavantage.co/query?function=INCOME_STATEMENT&symbol={company_ticker}&apikey={ALPHA_VANTAGE_API_KEY}"
48
- income_response = requests.get(income_url)
49
 
50
- if income_response.status_code == 200:
51
- income_data = income_response.json()
52
- annual_reports = income_data.get("annualReports", [])
53
- revenue = annual_reports[0].get("totalRevenue", "N/A") if annual_reports else "N/A"
54
- else:
55
- return "Error fetching income statement."
56
 
57
- return f"Market Cap: ${market_cap}\nTotal Revenue: ${revenue}"
58
-
59
- except Exception as e:
60
- traceback.print_exc()
61
- return "Error fetching financial data."
62
 
63
- def extract_and_embed_text(pdf_file):
64
- """Processes PDFs and generates embeddings with GPU acceleration using pymupdf."""
65
- try:
66
- docs, tokenized_texts = [], []
67
-
68
- with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
69
- full_text = "\n".join(page.get_text("text") for page in doc)
70
- chunks = text_splitter.split_text(full_text)
71
- for chunk in chunks:
72
- docs.append(chunk)
73
- tokenized_texts.append(chunk.split())
74
-
75
- embeddings = embedding_model.encode(docs, batch_size=64, convert_to_numpy=True, normalize_embeddings=True)
76
-
77
- embedding_dim = embeddings.shape[1]
78
- index = faiss.IndexHNSWFlat(embedding_dim, 32)
79
- index.add(embeddings)
80
-
81
- bm25 = BM25Okapi(tokenized_texts)
82
-
83
- return docs, embeddings, index, bm25
84
- except Exception as e:
85
- traceback.print_exc()
86
- return [], [], None, None
87
-
88
- def retrieve_relevant_docs(user_query, docs, index, bm25):
89
- """Hybrid search using FAISS cosine similarity & BM25 keyword retrieval."""
90
- query_embedding = embedding_model.encode(user_query, convert_to_numpy=True, normalize_embeddings=True)
91
- _, faiss_indices = index.search(np.array([query_embedding]), 8)
92
- bm25_scores = bm25.get_scores(user_query.split())
93
- bm25_indices = np.argsort(bm25_scores)[::-1][:8]
94
- combined_indices = list(set(faiss_indices[0]) | set(bm25_indices))
95
-
96
- return [docs[i] for i in combined_indices[:3]]
97
 
98
- def generate_response(user_query, pdf_ticker, ai_ticker, mode, uploaded_file):
 
99
  try:
100
- if mode == "📄 PDF Upload Mode":
101
- docs, embeddings, index, bm25 = extract_and_embed_text(uploaded_file)
102
- if not docs:
103
- return "❌ Error extracting text from PDF."
104
-
105
- retrieved_docs = retrieve_relevant_docs(user_query, docs, index, bm25)
106
- context = "\n\n".join(retrieved_docs)
107
- prompt = f"Summarize the key financial insights for {pdf_ticker} from this document:\n\n{context}"
108
-
109
- elif mode == "🌍 Live Data Mode":
110
- financial_info = fetch_financial_data(ai_ticker)
111
- prompt = f"Analyze the financial status of {ai_ticker} based on:\n{financial_info}\n\nUser Query: {user_query}"
112
- else:
113
- return "Invalid mode selected."
114
-
115
- response = llm.invoke(prompt)
116
- return response.content
117
  except Exception as e:
118
- traceback.print_exc()
119
- return "Error generating response."
 
 
 
120
 
121
- st.markdown(
122
- "<h1 style='text-align: center; color: #4CAF50;'>📄 FinQuery RAG Chatbot</h1>",
123
- unsafe_allow_html=True
124
- )
125
- st.markdown(
126
- "<h5 style='text-align: center; color: #666;'>Analyze financial reports or fetch live financial data effortlessly!</h5>",
127
- unsafe_allow_html=True
128
- )
129
 
130
- col1, col2 = st.columns(2)
 
 
 
 
131
 
132
- with col1:
133
- st.markdown("### 🏢 **Choose Your Analysis Mode**")
134
- mode = st.radio("", ["📄 PDF Upload Mode", "🌍 Live Data Mode"], horizontal=True)
135
 
136
- with col2:
137
- st.markdown("### 🔎 **Enter Your Query**")
138
- user_query = st.text_input("💬 What financial insights are you looking for?")
139
 
140
- st.markdown("---")
141
- uploaded_file, company_ticker = None, None
142
 
143
- if mode == "📄 PDF Upload Mode":
144
- st.markdown("### 📂 Upload Your Financial Report")
145
- uploaded_file = st.file_uploader("🔼 Upload PDF Report", type=["pdf"])
146
- company_ticker = None
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  else:
149
- st.markdown("### 🌍 Live Market Data")
150
- company_ticker = st.text_input("🏢 Enter Company Ticker Symbol", placeholder="e.g., AAPL, MSFT")
151
- uploaded_file = None
152
-
153
- # 🎯 Submit Button
154
- if st.button("🚀 Analyze Now"):
155
- if mode == "📄 PDF Upload Mode" and not uploaded_file:
156
- st.error("❌ Please upload a PDF file.")
157
- elif mode == "🌍 Live Data Mode" and not company_ticker:
158
- st.error("❌ Please enter a valid company ticker symbol.")
159
- else:
160
- with st.spinner("🔍 Your Query is Processing, this can take upto 5 - 7 minutes⏳"):
161
- response = generate_response(user_query, company_ticker, mode, uploaded_file)
162
- st.markdown("---")
163
- st.markdown("<h3 style='color: #4CAF50;'>💡 AI Response</h3>", unsafe_allow_html=True)
164
- st.write(response)
165
-
166
- # 📌 Footer
167
- st.markdown("---")
168
 
 
import json
import os
import tempfile
import textwrap

import faiss
import numpy as np
import requests  # FIX: used by the Groq chat-completion helpers below but was never imported
import streamlit as st
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1
from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
 
 
 
14
 
 
 
 
 
 
 
 
 
 
15
 
 
 
16
 
 
 
 
 
 
 
17
 
18
# ------------------- Secure Credential Loading for Hugging Face ------------------- #
# Loads the GCP Service Account JSON from a Hugging Face Space secret and wires
# it up for Application Default Credentials (ADC).

# 1. Load the Service Account JSON string from the environment variable (secret)
gcp_credentials_json_str = os.getenv("GCP_CREDENTIALS_JSON")
project_id = "wise-env-461717-t5"  # fallback; overwritten from the credentials below
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# 2. Check if the secret is present
if gcp_credentials_json_str:
    try:
        # Write to the /tmp/ directory, which is writable on Hugging Face Spaces.
        credentials_file_path = "/tmp/gcp_service_account.json"

        # 3. Write the JSON string to the file with owner-only permissions.
        # FIX: the original open(..., "w") created the key file with default
        # (typically world-readable) permissions; credentials must be 0o600.
        fd = os.open(credentials_file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(gcp_credentials_json_str)

        # 4. Set the environment variable so ADC picks up this file
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file_path

        # Extract project_id from the credentials for convenience
        creds_dict = json.loads(gcp_credentials_json_str)
        project_id = creds_dict.get("project_id")

    except Exception as e:
        st.error(f"🚨 Failed to process GCP credentials: {e}")
        st.stop()
else:
    st.error("🚨 GCP_CREDENTIALS_JSON secret not found! Please add it to your Hugging Face Space settings.")
    st.stop()
49
 
 
 
 
 
 
 
 
 
50
 
51
# ------------------- Configuration ------------------- #
# Project ID is dynamically loaded from the service account; abort early if
# the credentials did not contain one.
if not project_id:
    st.error("🚨 Project ID could not be found in the GCP credentials.")
    st.stop()

# You still need to provide your Processor ID and location (must match the
# region the Document AI processor was created in).
processor_id = "86a7eec52bbb9616"  # <-- REPLACE WITH YOUR PROCESSOR ID
location = "us"  # e.g., "us" or "eu"
 
 
 
 
61
 
 
 
62
 
 
 
 
 
63
 
64
# ------------------- Google Document AI Client (Uses ADC) ------------------- #
# The client automatically finds and uses the GOOGLE_APPLICATION_CREDENTIALS
# file set during credential loading above.
try:
    # Regional endpoint must match the processor's location.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    docai_client = documentai_v1.DocumentProcessorServiceClient(client_options=opts)
    # Fully-qualified resource name: projects/{id}/locations/{loc}/processors/{pid}
    full_processor_name = docai_client.processor_path(project_id, location, processor_id)
except Exception as e:
    st.error(f"Error initializing Document AI client: {e}")
    st.stop()
73
+
74
+
75
@st.cache_resource
def load_embedding_model():
    """Load (and cache) the SentenceTransformer used for chunk embeddings.

    Cached with st.cache_resource so the model is downloaded and initialised
    once per Streamlit process rather than on every script rerun.
    """
    # Use a writable cache directory (the default HF cache path is not
    # writable on Hugging Face Spaces).
    cache_dir = "/tmp/hf_cache"
    os.makedirs(cache_dir, exist_ok=True)

    # Point the Hugging Face libraries at the writable cache
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_HOME"] = cache_dir

    # Load embedding model
    return SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)


# FIX: build_index() and retrieve_context() below reference a module-level
# `embed_model`, but no binding existed anywhere in this file — calling them
# raised NameError. Bind it here (the cached loader makes this cheap on rerun).
embed_model = load_embedding_model()
87
+
88
+
89
# ------------------- Utility Functions ------------------- #
def chunk_text(text, max_chars=500):
    """Split *text* into pieces of at most *max_chars* characters.

    Delegates to textwrap.wrap, which prefers to break on whitespace
    (normalising whitespace runs) and hard-splits overlong words.
    """
    return textwrap.wrap(text, width=max_chars)
92
def extract_text_with_documentai(file_path):
    """OCR a local PDF through Document AI and return its full plain text."""
    with open(file_path, "rb") as pdf:
        pdf_bytes = pdf.read()

    # Synchronous (online) processing of the raw PDF bytes.
    request = documentai_v1.ProcessRequest(
        name=full_processor_name,
        raw_document=documentai_v1.RawDocument(
            content=pdf_bytes, mime_type="application/pdf"
        ),
    )
    return docai_client.process_document(request=request).document.text
100
+
101
def build_index(text):
    """Chunk *text*, embed every chunk, and build an exact L2 FAISS index.

    Returns (index, chunks) so callers can map search hits back to the
    original chunk strings.
    NOTE(review): relies on a module-level ``embed_model`` SentenceTransformer.
    """
    chunks = chunk_text(text)
    vectors = np.array(embed_model.encode(chunks))
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks
108
+
109
def retrieve_context(query, index, text_chunks, top_k=5):
    """Return up to *top_k* chunks most similar to *query* by L2 distance.

    FIX: when top_k exceeds the number of indexed vectors, FAISS pads the
    result with -1 indices; the original code then silently returned
    ``text_chunks[-1]`` duplicates. Negative indices are now filtered out.
    """
    query_embed = embed_model.encode([query])
    _, indices = index.search(np.array(query_embed), top_k)
    return [text_chunks[i] for i in indices[0] if i >= 0]
113
+
114
# ------------------- Groq API Functions ------------------- #
# NOTE(review): this section header previously said "Gemini", but every call
# below hits Groq's OpenAI-compatible chat-completions endpoint.
def ask_groq_agent(query, context):
    """Answer *query* grounded in *context* via Groq's llama3-70b chat model.

    Raises requests.HTTPError on a non-2xx response instead of failing with
    an opaque KeyError while indexing the error JSON body.
    """
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3
        },
        timeout=120,  # don't hang the Streamlit worker forever on a stalled API
    )
    response.raise_for_status()  # surface API errors clearly
    return response.json()["choices"][0]["message"]["content"]
127
def get_summary(text):
    """Ask the Groq model for a concise summary of *text*.

    Only the first 4000 characters are sent, to keep the prompt bounded.
    Raises requests.HTTPError on a non-2xx response.
    """
    prompt = f"Please provide a concise summary of the following document:\n\n{text[:4000]}"
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3
        },
        timeout=120,  # bound the wait on a stalled API
    )
    response.raise_for_status()  # FIX: previously a failed call raised KeyError
    return response.json()["choices"][0]["message"]["content"]
139
+
140
+
141
def generate_flashcards(text_chunks):
    """Generate up to 5 Q/A flashcards from *text_chunks* via the Groq model.

    Returns a list of ``{"question": ..., "answer": ...}`` dicts parsed from
    the model's "Q: ... / A: ..." formatted reply. Raises requests.HTTPError
    on a non-2xx API response.
    """
    joined_text = "\n".join(text_chunks)
    prompt = (
        "Generate 5 helpful flashcards from the following content. "
        "Use the format exactly like this:\n\n"
        "Q: What is ...?\nA: ...\n\nQ: How does ...?\nA: ...\n\n"
        "Text:\n" + joined_text
    )

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.5
        },
        timeout=120,  # bound the wait on a stalled API
    )
    response.raise_for_status()  # FIX: fail loudly instead of KeyError below
    content = response.json()["choices"][0]["message"]["content"]

    # Parse alternating "Q:" / "A:" lines; an answer is only recorded while a
    # question is pending, so stray or malformed lines are ignored.
    flashcards = []
    question = None
    for line in content.strip().splitlines():
        line = line.strip()
        if line.lower().startswith("q:"):
            question = line[2:].strip()
        elif line.lower().startswith("a:") and question:
            answer = line[2:].strip()
            flashcards.append({"question": question, "answer": answer})
            question = None
    return flashcards
172
+
173
st.title("📄 PDF AI Assistant (Groq + DocAI)")

# Initialise per-session state once. Streamlit reruns the whole script on
# every interaction, so guard on a sentinel key to keep state across reruns.
if "index" not in st.session_state:
    st.session_state.index = None        # FAISS index over the current PDF
    st.session_state.text_chunks = []    # chunk strings aligned with the index
    st.session_state.raw_text = ""       # full extracted document text
179
+
180
with st.sidebar:
    st.header("📤 Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file is not None:
        # FIX: initialise tmp_path before the try block — if NamedTemporaryFile
        # itself failed, the original `finally: os.unlink(tmp_path)` raised
        # UnboundLocalError and masked the real exception.
        tmp_path = None
        try:
            # Document AI helper reads from disk, so persist the upload first.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_file.write(uploaded_file.read())
                tmp_file.flush()
                tmp_path = tmp_file.name

            # DEBUG: File info
            st.write("Saved file at:", tmp_path)
            st.write("File size:", os.path.getsize(tmp_path), "bytes")
            st.write("File exists:", os.path.exists(tmp_path))

            with st.spinner("Extracting text using Document AI..."):
                raw_text = extract_text_with_documentai(tmp_path)
                index, text_chunks = build_index(raw_text)
                st.session_state.index = index
                st.session_state.text_chunks = text_chunks
                st.session_state.raw_text = raw_text
                st.success("✅ Document processed successfully.")
        except Exception as e:
            st.error(f"Error: {e}")
        finally:
            # Always remove the temp copy of the uploaded PDF — if it was created.
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)
207
+
208
+
209
# ------------------- Q&A Interface ------------------- #
st.subheader("❓ Ask Questions")
# Explicit None check: don't rely on the truthiness of a faiss index object.
if st.session_state.index is not None:
    question = st.text_input("Enter your question")
    if st.button("Ask"):
        # FIX: guard against an empty query instead of embedding "".
        if not question.strip():
            st.warning("Please enter a question first.")
        else:
            context = "\n\n".join(
                retrieve_context(question, st.session_state.index, st.session_state.text_chunks)
            )
            answer = ask_groq_agent(question, context)
            st.markdown(f"**Answer:** {answer}")
else:
    st.info("Upload a PDF to start asking questions.")
219
+
220
# ------------------- Summary Interface ------------------- #
st.subheader("📝 Document Summary")
# Guard clause: the button is only rendered once a document is loaded.
if not st.session_state.text_chunks:
    st.info("Upload a PDF to get a summary.")
elif st.button("Generate Summary"):
    with st.spinner("Generating summary..."):
        st.markdown(get_summary(" ".join(st.session_state.text_chunks)))
229
+
230
# ------------------- Flashcards ------------------- #
st.subheader("🧠 Flashcards")
# Guard clause: the button is only rendered once a document is loaded.
if not st.session_state.text_chunks:
    st.info("Upload a PDF to generate flashcards.")
elif st.button("Generate Flashcards"):
    with st.spinner("Generating flashcards..."):
        for card in generate_flashcards(st.session_state.text_chunks):
            st.markdown(f"**Q: {card['question']}**\n\nA: {card['answer']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240