Harishkhawaja committed on
Commit
681d2cc
·
verified ·
1 Parent(s): e5bd2a7

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +70 -24
src/streamlit_app.py CHANGED
@@ -2,44 +2,80 @@ import streamlit as st
2
  from sentence_transformers import SentenceTransformer
3
  import torch
4
  import faiss
5
- import os
6
  import PyPDF2
7
  from groq import Groq
 
 
 
 
8
 
9
  # Load embedding model
10
- model = SentenceTransformer("thenlper/gte-small")
 
 
 
 
 
11
 
12
  # Initialize Groq client
13
- groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
 
 
 
 
 
 
14
 
15
  def embed_chunks(chunks):
16
- return model.encode(chunks, convert_to_numpy=True)
 
 
 
 
17
 
18
  def chunk_text(text, chunk_size=500, overlap=100):
19
  chunks = []
20
  for i in range(0, len(text), chunk_size - overlap):
21
- chunks.append(text[i:i+chunk_size])
22
  return chunks
23
 
 
24
  def create_faiss_index(embeddings):
25
- dim = embeddings.shape[1]
26
- index = faiss.IndexFlatL2(dim)
27
- index.add(embeddings)
28
- return index
 
 
 
 
29
 
30
  def search_index(query, index, chunks, top_k=5):
31
- query_embedding = embed_chunks([query])
32
- distances, indices = index.search(query_embedding, top_k)
33
- return [chunks[i] for i in indices[0]]
 
 
 
 
 
 
34
 
35
  def extract_text_from_pdf(file):
36
- reader = PyPDF2.PdfReader(file)
37
- text = ""
38
- for page in reader.pages:
39
- text += page.extract_text() or ""
40
- return text
 
 
 
 
41
 
42
  def ask_groq(query, context):
 
 
43
  try:
44
  completion = groq_client.chat.completions.create(
45
  messages=[
@@ -61,10 +97,10 @@ def ask_groq(query, context):
61
  except Exception as e:
62
  return f"Error from Groq API: {e}"
63
 
 
64
  # Streamlit app
65
  st.set_page_config(page_title="Lexicon: Policy Explainer Bot", layout="wide")
66
  st.title("📜 Lexicon: Understand Policies with Confidence")
67
-
68
  st.markdown("Upload a PDF or paste policy text below. Lexicon will highlight key points and flag potential risks.")
69
 
70
  uploaded_file = st.file_uploader("Upload Policy/T&C PDF", type=["pdf"])
@@ -76,19 +112,29 @@ if uploaded_file or clipboard_text.strip():
76
  else:
77
  text = clipboard_text.strip()
78
 
 
 
 
 
79
  st.success("Document loaded. Processing...")
80
  chunks = chunk_text(text)
81
- embeddings = embed_chunks(chunks)
82
- index = create_faiss_index(embeddings)
 
 
 
83
 
84
  with st.expander("🔍 Ask a question about this policy"):
85
  query = st.text_input("Enter your question")
86
  if query:
87
  relevant_chunks = search_index(query, index, chunks)
88
- context = "\n\n".join(relevant_chunks)
89
- answer = ask_groq(query=query, context=context)
90
- st.markdown("**Answer:**")
91
- st.info(answer)
 
 
 
92
 
93
  st.markdown("✅ **Ready for follow-up questions.** Ask anything about clauses, risks, or key terms.")
94
  else:
 
2
  from sentence_transformers import SentenceTransformer
3
  import torch
4
  import faiss
 
5
  import PyPDF2
6
  from groq import Groq
7
+ import os
8
+
9
+ # Check if running in a Hugging Face Space
10
+ HF_SPACE = "HF_SPACE_ID" in os.environ # Corrected check. The env var is HF_SPACE_ID, not SPACE_ID
11
 
12
  # Load embedding model
13
+ try:
14
+ model = SentenceTransformer("thenlper/gte-small")
15
+ except Exception as e:
16
+ st.error(f"Error loading the Sentence Transformer model: {e}. Please ensure the correct version of sentence-transformers is in requirements.txt.")
17
+ # Stop if the model fails to load. Crucial for HuggingFace
18
+ st.stop()
19
 
20
  # Initialize Groq client
21
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
22
+ if not GROQ_API_KEY:
23
+ st.error("GROQ_API_KEY environment variable not set. The app will not be able to query Groq.")
24
+ # Don't stop here, allow basic functionality. Groq features will be unavailable, but the rest can work.
25
+ groq_client = None
26
+ else:
27
+ groq_client = Groq(api_key=GROQ_API_KEY)
28
+
29
 
30
  def embed_chunks(chunks):
31
+ try:
32
+ return model.encode(chunks, convert_to_numpy=True)
33
+ except Exception as e:
34
+ st.error(f"Error embedding chunks: {e}")
35
+ return None # Important: Handle the error, return None
36
 
37
  def chunk_text(text, chunk_size=500, overlap=100):
38
  chunks = []
39
  for i in range(0, len(text), chunk_size - overlap):
40
+ chunks.append(text[i:i + chunk_size])
41
  return chunks
42
 
43
+
44
  def create_faiss_index(embeddings):
45
+ try:
46
+ dim = embeddings.shape[1]
47
+ index = faiss.IndexFlatL2(dim)
48
+ index.add(embeddings)
49
+ return index
50
+ except Exception as e:
51
+ st.error(f"Error creating FAISS index: {e}")
52
+ return None # Important: Handle error
53
 
54
  def search_index(query, index, chunks, top_k=5):
55
+ try:
56
+ query_embedding = embed_chunks([query])
57
+ if query_embedding is None or index is None: # handle errors from embed_chunks or create_faiss_index
58
+ return []
59
+ distances, indices = index.search(query_embedding, top_k)
60
+ return [chunks[i] for i in indices[0]]
61
+ except Exception as e:
62
+ st.error(f"Error searching FAISS index: {e}")
63
+ return []
64
 
65
  def extract_text_from_pdf(file):
66
+ try:
67
+ reader = PyPDF2.PdfReader(file)
68
+ text = ""
69
+ for page in reader.pages:
70
+ text += page.extract_text() or ""
71
+ return text
72
+ except Exception as e:
73
+ st.error(f"Error extracting text from PDF: {e}")
74
+ return ""
75
 
76
  def ask_groq(query, context):
77
+ if groq_client is None:
78
+ return "Groq API key is not configured. This feature is unavailable."
79
  try:
80
  completion = groq_client.chat.completions.create(
81
  messages=[
 
97
  except Exception as e:
98
  return f"Error from Groq API: {e}"
99
 
100
+
101
  # Streamlit app
102
  st.set_page_config(page_title="Lexicon: Policy Explainer Bot", layout="wide")
103
  st.title("📜 Lexicon: Understand Policies with Confidence")
 
104
  st.markdown("Upload a PDF or paste policy text below. Lexicon will highlight key points and flag potential risks.")
105
 
106
  uploaded_file = st.file_uploader("Upload Policy/T&C PDF", type=["pdf"])
 
112
  else:
113
  text = clipboard_text.strip()
114
 
115
+ if not text: # Handle the case where extraction/clipboard yields empty text
116
+ st.error("No text was extracted from the PDF or provided in the text area. Please check your input.")
117
+ st.stop()
118
+
119
  st.success("Document loaded. Processing...")
120
  chunks = chunk_text(text)
121
+ embeddings = embed_chunks(chunks) # embeddings can be None if error
122
+ if embeddings is not None:
123
+ index = create_faiss_index(embeddings) # index can be None if error
124
+ else:
125
+ index = None
126
 
127
  with st.expander("🔍 Ask a question about this policy"):
128
  query = st.text_input("Enter your question")
129
  if query:
130
  relevant_chunks = search_index(query, index, chunks)
131
+ if relevant_chunks: # only call groq if relevant chunks were found.
132
+ context = "\n\n".join(relevant_chunks)
133
+ answer = ask_groq(query=query, context=context)
134
+ st.markdown("**Answer:**")
135
+ st.info(answer)
136
+ else:
137
+ st.info("No relevant information found in the document to answer your question.")
138
 
139
  st.markdown("✅ **Ready for follow-up questions.** Ask anything about clauses, risks, or key terms.")
140
  else: