cs-mubashir committed
Commit 4f7f1a1 · verified · Parent(s): 2249594

Upload 5 files

Files changed (5):
  1. .env +1 -0
  2. app.py +85 -0
  3. paper_reading.py +364 -0
  4. requirements.txt +15 -0
  5. streamlit.py +91 -0
.env ADDED
@@ -0,0 +1 @@
+ # Set your own key here; never commit a real API key to the repository.
+ GOOGLE_API_KEY=your-google-api-key-here
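app.py consumes this variable at startup through python-dotenv; a minimal sketch of that load path (mirroring the calls in app.py below):

    from dotenv import load_dotenv
    import os

    load_dotenv()                      # reads .env from the working directory
    key = os.getenv("GOOGLE_API_KEY")  # None if the variable is not set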
app.py ADDED
@@ -0,0 +1,85 @@
+ import os
+
+ import streamlit as st
+ import google.generativeai as genai
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.prompts import PromptTemplate
+ # FAISS lives in langchain_community in recent LangChain releases.
+ from langchain_community.vectorstores import FAISS
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
+
+ # Load GOOGLE_API_KEY from .env and configure the Gemini client.
+ load_dotenv()
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+
+
+ def get_pdf_text(pdf_docs):
+     """Concatenate the extracted text of every page in the uploaded PDFs."""
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             # extract_text() can return None for image-only pages.
+             text += page.extract_text() or ""
+     return text
+
+
+ def get_text_chunks(text):
+     """Split raw text into ~10k-character chunks with 1k-character overlap."""
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vector_store(text_chunks):
+     """Embed the chunks with Gemini embeddings and persist a FAISS index locally."""
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
+     vector_store.save_local("faiss_index")
+
+
+ def get_conversational_chain():
+     """Build a "stuff" QA chain over Gemini with a context-grounded prompt."""
+     prompt_template = """
+     Answer the question as detailed as possible from the provided context. Make sure to
+     provide all the details. If the answer is not in the provided context, just say
+     "answer is not available in the context"; do not provide a wrong answer.
+
+     Context:\n{context}\n
+     Question:\n{question}\n
+
+     Answer:
+     """
+
+     model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
+     prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+     chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
+
+     return chain
+
+
+ def user_input(user_question):
+     """Answer a question against the persisted FAISS index."""
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+
+     new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
+     docs = new_db.similarity_search(user_question)
+
+     chain = get_conversational_chain()
+     response = chain(
+         {"input_documents": docs, "question": user_question},
+         return_only_outputs=True,
+     )
+
+     print(response)
+     st.write("Reply: ", response["output_text"])
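Taken together, these helpers form the full index-then-ask pipeline. A minimal driving sketch, assuming a valid GOOGLE_API_KEY in .env and a local sample.pdf (the file name is hypothetical); note that user_input replies through st.write, so outside a Streamlit session the answer only reaches the console via print:

    from app import get_pdf_text, get_text_chunks, get_vector_store, user_input

    with open("sample.pdf", "rb") as f:
        raw_text = get_pdf_text([f])              # concatenate page text
    get_vector_store(get_text_chunks(raw_text))   # build and save faiss_index
    user_input("What is the main contribution?")  # retrieve + answer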
paper_reading.py ADDED
@@ -0,0 +1,364 @@
+ import os
+ import re
+ import time
+
+ import pandas as pd
+ import PyPDF2
+ import tiktoken
+ from groq import Groq
+
+ # Read the key from the environment rather than committing it to source control.
+ api_key = os.getenv("GROQ_API_KEY")
+ client = Groq(api_key=api_key)
+
+
+ def count_tokens(text):
+     """Return the number of tokens in a text string."""
+     encoding = tiktoken.get_encoding("cl100k_base")
+     num_tokens = len(encoding.encode(text))
+     return num_tokens
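For instance, with the cl100k_base encoding two common English words map to two tokens:

    count_tokens("hello world")  # -> 2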
+
+
+ def get_pdf_files(folder_path):
+     """
+     Retrieve PDF files from the specified folder path with improved error handling.
+
+     Args:
+         folder_path (str): Path to the folder containing PDF files
+
+     Returns:
+         list: List of full paths to PDF files
+     """
+     # Validate folder path
+     if not os.path.exists(folder_path):
+         raise ValueError(f"Folder path does not exist: {folder_path}")
+
+     pdf_files = []
+
+     # Walk the directory tree and collect every file with a .pdf extension.
+     for root, dirs, files in os.walk(folder_path):
+         for file in files:
+             if file.lower().endswith('.pdf'):
+                 pdf_files.append(os.path.join(root, file))
+
+     if not pdf_files:
+         raise ValueError(f"No PDF files found in the folder: {folder_path}")
+
+     return pdf_files
+
+
+ def get_txt_from_pdf(pdf_files, filter_ref=False):
+     """Extract page text from each PDF, split every page into quarters, and
+     return a DataFrame of the sections that carry more than ~40 tokens."""
+     data = []
+
+     for pdf in pdf_files:
+         try:
+             with open(pdf, 'rb') as pdf_content:
+                 pdf_reader = PyPDF2.PdfReader(pdf_content)
+
+                 for page_num in range(len(pdf_reader.pages)):
+                     page = pdf_reader.pages[page_num]
+                     # extract_text() can return None for image-only pages.
+                     page_text = page.extract_text() or ""
+                     page_text_join = ' '.join(page_text.split())
+
+                     if filter_ref:
+                         page_text_join = remove_ref(page_text_join)
+
+                     page_len = len(page_text_join)
+                     div_len = page_len // 4  # Divide the page into 4 parts
+                     page_parts = [page_text_join[i * div_len:(i + 1) * div_len] for i in range(4)]
+
+                     min_tokens = 40
+                     for i, page_part in enumerate(page_parts):
+                         if count_tokens(page_part) > min_tokens:
+                             data.append({
+                                 'file name': os.path.basename(pdf),
+                                 'page number': page_num + 1,
+                                 'page section': i + 1,
+                                 'content': page_part,
+                                 'tokens': count_tokens(page_part)
+                             })
+         except Exception as e:
+             print(f"Error processing {pdf}: {e}")
+
+     # Create a DataFrame from the collected sections.
+     df = pd.DataFrame(data)
+     return df
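A typical call, assuming ./papers holds the source PDFs (the path is illustrative):

    pdf_files = get_pdf_files("./papers")
    df = get_txt_from_pdf(pdf_files, filter_ref=True)
    print(df[['file name', 'page number', 'tokens']].head())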
+
+
+ def remove_ref(pdf_text):
+     """Strip the references/acknowledgment section from extracted page text."""
+     pattern = r'(REFERENCES|Acknowledgment|ACKNOWLEDGMENT)'
+     match = re.search(pattern, pdf_text)
+
+     if match:
+         # If a heading is found, drop everything after it.
+         clean_text = pdf_text[:match.start()].strip()
+     else:
+         # Regular-expression patterns that look like bibliography entries.
+         reference_patterns = [
+             r'\[[\d\w]{1,3}\].+?[\d]{3,5}\.', r'\[[\d\w]{1,3}\].+?[\d]{3,5};',
+             r'\([\d\w]{1,3}\).+?[\d]{3,5}\.', r'\[[\d\w]{1,3}\].+?[\d]{3,5},',
+             r'\([\d\w]{1,3}\).+?[\d]{3,5},', r'\[[\d\w]{1,3}\].+?[\d]{3,5}',
+             r'[\d\w]{1,3}\).+?[\d]{3,5}\.', r'[\d\w]{1,3}\).+?[\d]{3,5}',
+             r'\([\d\w]{1,3}\).+?[\d]{3,5}', r'^[\w\d,\.– ;)-]+$',
+         ]
+
+         # Remove matches of the first eight patterns when the heuristics fire.
+         for pattern in reference_patterns[:8]:
+             matches = re.findall(pattern, pdf_text, flags=re.S)
+             if len(matches) > 500 and matches.count('.') < 2 and matches.count(',') < 2 and not matches[-1].isdigit():
+                 pdf_text = re.sub(pattern, '', pdf_text)
+
+         # Strip each line and apply the remaining patterns line by line.
+         lines = pdf_text.split('\n')
+         for i, line in enumerate(lines):
+             lines[i] = line.strip()
+             for pattern in reference_patterns[7:]:
+                 matches = re.findall(pattern, lines[i])
+                 # Count digits across all matches (joining first avoids passing a list to re.findall).
+                 digit_count = len(re.findall(r'\d', ''.join(matches)))
+                 if len(matches) > 500 and digit_count < 8 and len(set(matches)) > 10 and matches.count(',') < 2 and len(matches) > 20:
+                     lines[i] = re.sub(pattern, '', lines[i])
+
+         # Join the lines back together, excluding any empty lines.
+         clean_text = '\n'.join([line for line in lines if line])
+
+     return clean_text
+
+
+ def split_content(input_string, tokens):
+     """Split a string into chunks of at most `tokens` tokens, preferring to cut
+     at the last sentence end or newline."""
+     MAX_TOKENS = tokens
+     split_strings = []
+     current_string = ""
+     tokens_so_far = 0
+
+     for word in input_string.split():
+         # Check if adding the next word would exceed the max token limit.
+         if tokens_so_far + count_tokens(word) > MAX_TOKENS:
+             # Look for the last dot or newline in the current string.
+             last_dot = current_string.rfind(".")
+             last_newline = current_string.rfind("\n")
+             cut_index = max(last_dot, last_newline)
+
+             # If there's no dot or newline, just cut at MAX_TOKENS.
+             if cut_index == -1:
+                 cut_index = MAX_TOKENS
+
+             # Emit the chunk, keep the remainder, and recount its tokens.
+             split_strings.append(current_string[:cut_index + 1].strip())
+             current_string = current_string[cut_index + 1:].strip()
+             tokens_so_far = count_tokens(current_string)
+
+         # Add the current word and update the running token count.
+         current_string += " " + word
+         tokens_so_far += count_tokens(word)
+
+     # Flush whatever is left.
+     split_strings.append(current_string.strip())
+
+     return split_strings
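For example, mirroring how model_1 later budgets its requests, a long section can be cut into roughly 3000-token chunks:

    long_text = "Some experimental section text. " * 500  # stands in for any large string
    chunks = split_content(long_text, 3000)
    print([count_tokens(c) for c in chunks])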
+
+
+ def combine_section(df):
+     """Merge page sections per PDF: concatenate content and sum token counts."""
+     aggregated_df = df.groupby('file name').agg({
+         'content': aggregate_content,
+         'tokens': aggregate_tokens
+     }).reset_index()
+
+     return aggregated_df
+
+
+ def combine_main_SI(df):
+     """Group a paper's main text and SI under the main part of the file name,
+     then aggregate the content and tokens."""
+     df['main_part'] = df['file name'].apply(extract_title)
+     merged_df = df.groupby('main_part').agg({
+         'content': ''.join,
+         'tokens': sum
+     }).reset_index()
+
+     return merged_df.rename(columns={'main_part': 'file name'})
+
+
+ def aggregate_content(series):
+     """Join all elements in the series with a space separator."""
+     return ' '.join(series)
+
+
+ def aggregate_tokens(series):
+     """Sum all elements in the series."""
+     return series.sum()
+
+
+ def extract_title(file_name):
+     """Extract the main part of the file name."""
+     title = file_name.split('_')[0]
+     # removesuffix (Python 3.9+) strips the extension exactly; rstrip('.pdf')
+     # would also eat trailing 'p', 'd', 'f', or '.' characters from the title.
+     return title.removesuffix('.pdf')
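So a paper split across a main text and its SI collapses to one key (file names are illustrative):

    extract_title("UiO-66_SI.pdf")  # -> 'UiO-66'
    extract_title("UiO-66.pdf")     # -> 'UiO-66'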
+
+
+ def model_1(df):
+     """Model 1 turns the text in the DataFrame into a summarized reaction-condition table."""
+     response_msgs = []
+
+     for index, row in df.iterrows():
+         column1_value = row[df.columns[0]]  # file name
+         column2_value = row['content']
+
+         # Split oversized contexts so each request stays under the token budget.
+         max_tokens = 3000
+         if count_tokens(column2_value) > max_tokens:
+             context_list = split_content(column2_value, max_tokens)
+         else:
+             context_list = [column2_value]
+
+         answers = ''  # Collect answers from Groq
+         for context in context_list:
+             print("Start to analyze paper " + str(column1_value))
+             user_prompt = f"""This is an experimental section on MOF synthesis from paper {column1_value}
+
+ Context:
+ {context}
+
+ Q: Can you summarize the following details in a table:
+ compound name or chemical formula (if the name is not provided), metal source, metal amount, organic linker(s),
+ linker amount, modulator, modulator amount or volume, solvent(s), solvent volume(s), reaction temperature,
+ and reaction time?
+
+ Rules:
+ - If any information is not provided or you are unsure, use "N/A"
+ - Focus on extracting experimental conditions from only the MOF synthesis
+ - Ignore information related to organic linker synthesis, MOF postsynthetic modification, high-throughput (HT) experiment details, or catalysis reactions
+ - If multiple conditions are provided for the same compound, use multiple rows to represent them
+ - If multiple units or components are provided for the same factor (e.g., g and mol for the weight, multiple linkers or metals, multiple temperatures and reaction times, mixed solvents, etc.), include them in the same cell, separated by commas
+ - The table should have 11 columns, all in lowercase:
+ | compound name | metal source | metal amount | linker | linker amount | modulator | modulator amount or volume | solvent | solvent volume | reaction temperature | reaction time |
+
+ Respond with ONLY the table."""
+
+             # Retry loop: transient API errors and refusals both get three attempts.
+             attempts = 3
+             while attempts > 0:
+                 try:
+                     response = client.chat.completions.create(
+                         model="llama-3.1-70b-versatile",  # or another available Groq model
+                         messages=[
+                             {"role": "system", "content": "You are a helpful assistant specialized in extracting MOF synthesis details."},
+                             {"role": "user", "content": user_prompt}
+                         ]
+                     )
+
+                     answers_text = response.choices[0].message.content
+                     # Accept only non-empty, non-apologetic responses.
+                     if answers_text and not answers_text.lower().startswith("i apologize"):
+                         answers += '\n' + answers_text
+                         break
+                     else:
+                         raise ValueError("Invalid or apologetic response")
+
+                 except Exception as e:
+                     attempts -= 1
+                     if attempts <= 0:
+                         print(f"Error: Failed to process paper {column1_value}. Skipping. (model 1)")
+                         break
+                     print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 1)")
+                     time.sleep(60)
+
+         response_msgs.append(answers)
+
+     df = df.copy()
+     df.loc[:, 'summarized'] = response_msgs
+     return df
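A minimal invocation sketch, assuming GROQ_API_KEY is set and using a hypothetical one-row DataFrame in the shape model_1 expects (first column is the paper name, plus a 'content' column):

    import pandas as pd
    from paper_reading import model_1

    toy = pd.DataFrame([{'file name': 'demo.pdf',
                         'content': '...experimental section text...',
                         'tokens': 120}])
    print(model_1(toy).loc[0, 'summarized'])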
+
+
+ def model_2(df):
+     """Model 2 classifies which sections contain a real synthesis procedure,
+     filters out the rest, and hands the survivors to model_1."""
+     response_msgs = []
+     prev_paper_name = None
+     total_pages = df.groupby(df.columns[0])[df.columns[1]].max()
+
+     for _, row in df.iterrows():
+         paper_name = row[df.columns[0]]
+         page_number = row[df.columns[1]]
+
+         if paper_name != prev_paper_name:
+             print(f'Processing paper: {paper_name}. Total pages: {total_pages[paper_name]}')
+             prev_paper_name = paper_name
+
+         context = row['content']
+
+         user_prompt = """I will provide a context. Determine if the section contains a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes.
+
+ Examples:
+ 1. Context: "In a 4-mL scintillation vial, the linker H2PZVDC (91.0 mg, 0.5 mmol, 1 equiv.) was dissolved in N,N-dimethylformamide (DMF) (0.6 mL) upon sonication."
+    Answer: Yes
+
+ 2. Context: "Synthesis and Characterization of MOFs, Abbreviations, and General Procedures."
+    Answer: No
+
+ 3. Context: "The design and synthesis of metal-organic frameworks (MOFs) has yielded a large number of structures"
+    Answer: No
+
+ Respond with only "Yes" or "No" based on the following context:
+ """ + context
+
+         attempts = 3
+         while attempts > 0:
+             try:
+                 response = client.chat.completions.create(
+                     model="llama-3.1-70b-versatile",  # or another available Groq model
+                     messages=[
+                         {"role": "system", "content": "You are a helpful assistant specialized in identifying MOF synthesis sections."},
+                         {"role": "user", "content": user_prompt}
+                     ]
+                 )
+                 answers = response.choices[0].message.content.strip()
+
+                 # Accept only a bare Yes/No verdict.
+                 if answers in ["Yes", "No"]:
+                     break
+                 else:
+                     raise ValueError("Invalid response")
+
+             except Exception as e:
+                 attempts -= 1
+                 if attempts > 0:
+                     print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 2)")
+                     time.sleep(60)
+                 else:
+                     print(f"Error: Failed to process paper {paper_name}. Skipping. (model 2)")
+                     answers = "No"
+                     break
+
+         response_msgs.append(answers)
+
+     df = df.copy()
+     df.loc[:, 'classification'] = response_msgs
+
+     # Drop "No" sections flanked by "No" on both sides, keeping the "No"
+     # sections that border synthesis text for context.
+     mask_no = df["classification"].str.startswith("No")
+     mask_surrounded_by_no = mask_no.shift(1, fill_value=False) & mask_no.shift(-1, fill_value=False)
+     mask_to_remove = mask_no & mask_surrounded_by_no
+     filtered_df = df[~mask_to_remove]
+
+     # Merge the surviving sections per paper and summarize with model_1.
+     combined_df = combine_main_SI(combine_section(filtered_df))
+     add_table_df = model_1(combined_df)
+     return add_table_df[['file name', 'summarized']]
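End to end, the module is driven the same way streamlit.py drives it: collect PDFs, section them, then classify and summarize (the folder path is illustrative):

    from paper_reading import get_pdf_files, get_txt_from_pdf, model_2

    pdfs = get_pdf_files("./papers")
    sections = get_txt_from_pdf(pdfs)
    tables = model_2(sections)  # DataFrame with 'file name' and 'summarized'
    tables.to_csv("mof_synthesis_data.csv", index=False)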
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ streamlit
+ google-generativeai
+ python-dotenv
+ langchain
+ PyPDF2
+ chromadb
+ faiss-cpu
+ langchain_google_genai
+ groq
+ langchain-groq
+ langchain_community
+ requests
+ pandas
+ tiktoken
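To reproduce the environment and launch the UI (a minimal sketch; str.removesuffix in paper_reading.py assumes Python 3.9+):

    pip install -r requirements.txt
    streamlit run streamlit.py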
streamlit.py ADDED
@@ -0,0 +1,91 @@
+ import streamlit as st
+
+ from app import get_pdf_text, get_text_chunks, get_vector_store, user_input
+ from paper_reading import get_pdf_files, get_txt_from_pdf, model_2
+
+
+ def mof_synthesis_processing(df):
+     """Placeholder for MOF synthesis processing - replace with actual implementation"""
+     # This is a simplified placeholder - implement the actual processing here.
+     st.warning("Full MOF processing implementation needed")
+     return model_2(df)
+
+
+ def main():
+     st.set_page_config("Multi-Functional PDF Tool")
+
+     # Sidebar for mode selection
+     app_mode = st.sidebar.selectbox("Choose Application Mode",
+                                     ["Chat with PDF", "MOF Synthesis Paper Processing"])
+
+     if app_mode == "Chat with PDF":
+         st.header("Chat with PDF using Gemini and Google Embeddings💁")
+
+         user_question = st.text_input("Ask a Question from the PDF Files")
+
+         if user_question:
+             user_input(user_question)
+
+         with st.sidebar:
+             st.title("Upload PDFs:")
+             pdf_docs = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
+             if st.button("Submit & Process"):
+                 with st.spinner("Processing..."):
+                     raw_text = get_pdf_text(pdf_docs)
+                     text_chunks = get_text_chunks(raw_text)
+                     get_vector_store(text_chunks)
+                     st.success("PDFs Processed Successfully")
+
+     elif app_mode == "MOF Synthesis Paper Processing":
+         st.title("MOF Synthesis Paper Processing")
+
+         # Folder input for PDF processing
+         folder = st.text_input("Enter the full path to the folder containing PDF files")
+
+         if folder:
+             try:
+                 pdf_files = get_pdf_files(folder)
+
+                 if st.button("Process MOF Synthesis Papers"):
+                     with st.spinner("Extracting text from PDFs..."):
+                         # Extract text from PDFs
+                         pdf_dataframe = get_txt_from_pdf(pdf_files)
+
+                     if not pdf_dataframe.empty:
+                         st.write(f"Extracted text from {len(pdf_dataframe['file name'].unique())} PDFs")
+                         st.write(f"Total sections processed: {len(pdf_dataframe)}")
+
+                         # Process MOF synthesis papers
+                         with st.spinner("Analyzing MOF Synthesis Papers..."):
+                             processed_df = mof_synthesis_processing(pdf_dataframe)
+
+                         if not processed_df.empty:
+                             st.success("MOF Synthesis Papers Processed Successfully!")
+
+                             # Option to download processed data
+                             csv = processed_df.to_csv(index=False)
+                             st.download_button(
+                                 label="Download Processed Data",
+                                 data=csv,
+                                 file_name="mof_synthesis_data.csv",
+                                 mime="text/csv"
+                             )
+
+                             # Display the results table
+                             st.dataframe(processed_df)
+                         else:
+                             st.warning("No MOF synthesis data was extracted.")
+
+                     else:
+                         st.error("No PDFs found or error in extracting text")
+
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")
+
+
+ if __name__ == "__main__":
+     main()
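One runtime caveat, stated as an assumption: a script named streamlit.py can shadow the installed streamlit package when Python resolves imports from the script's own directory. Renaming the entry point (ui.py is a hypothetical choice) sidesteps the collision:

    mv streamlit.py ui.py
    streamlit run ui.py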