Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- .env +1 -0
- app.py +85 -0
- paper_reading.py +364 -0
- requirements.txt +15 -0
- streamlit.py +91 -0
.env
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
GOOGLE_API_KEY = '<your-google-api-key-here>'  # SECURITY: a real key was committed here — it must be rotated and never re-committed
|
app.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from PyPDF2 import PdfReader
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
import os
|
| 5 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 6 |
+
import google.generativeai as genai
|
| 7 |
+
from langchain.vectorstores import FAISS
|
| 8 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 9 |
+
from langchain.chains.question_answering import load_qa_chain
|
| 10 |
+
from langchain.prompts import PromptTemplate
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
|
| 13 |
+
# Load environment variables from .env and configure the Gemini client.
load_dotenv()
# NOTE(review): a bare `os.getenv("GOOGLE_API_KEY")` call used to sit here;
# it discarded its result and was a no-op, so it has been removed.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_pdf_text(pdf_docs):
    """Concatenate the text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects (or paths) accepted by PdfReader.

    Returns:
        str: all extracted page text joined together ("" when pdf_docs is empty).
    """
    parts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for image-only/scanned pages; the
            # previous `text += page.extract_text()` raised TypeError there.
            parts.append(page.extract_text() or "")
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(parts)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    A 10,000-character window with 1,000-character overlap keeps context
    intact across chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_vector_store(text_chunks):
    """Embed *text_chunks* with Gemini embeddings and persist a FAISS index.

    The index is written to the local "faiss_index" directory, from which
    user_input() later reloads it.
    """
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    index = FAISS.from_texts(text_chunks, embedding=embedder)
    index.save_local("faiss_index")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_conversational_chain():
    """Build a "stuff"-type QA chain backed by Gemini Pro.

    The prompt instructs the model to answer only from the supplied context
    and to say "answer is not available in the context" otherwise.

    Returns:
        A load_qa_chain chain expecting "input_documents" and "question".
    """
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """

    llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def user_input(user_question):
    """Answer *user_question* from the persisted FAISS index and render the reply.

    Loads the locally saved "faiss_index", retrieves the chunks most similar
    to the question, runs the QA chain over them, and writes the answer to
    the Streamlit page.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # allow_dangerous_deserialization is required because FAISS persists via
    # pickle; safe here since the index is produced locally by get_vector_store().
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()

    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True)

    # NOTE(review): removed leftover debug `print(response)`; the answer is
    # rendered to the UI below.
    st.write("Reply: ", response["output_text"])
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
paper_reading.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import PyPDF2
|
| 3 |
+
import re
|
| 4 |
+
import os
|
| 5 |
+
import requests
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import tiktoken
|
| 8 |
+
import time
|
| 9 |
+
from io import StringIO
|
| 10 |
+
from groq import Groq
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# SECURITY: the Groq API key was previously hard-coded here (and committed).
# Read it from the environment instead; the leaked key must be rotated.
api_key = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=api_key)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def count_tokens(text):
    """Return the number of cl100k_base tokens in *text*."""
    encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))
|
| 24 |
+
|
| 25 |
+
def get_pdf_files(folder_path):
    """
    Collect the paths of all PDF files under *folder_path* (recursive).

    Args:
        folder_path (str): Path to the folder containing PDF files

    Returns:
        list: List of full paths to PDF files

    Raises:
        ValueError: if the folder does not exist or contains no PDFs.
    """
    # Fail fast on a bad path rather than silently returning nothing.
    if not os.path.exists(folder_path):
        raise ValueError(f"Folder path does not exist: {folder_path}")

    # Recursively gather every file whose name ends in .pdf (case-insensitive).
    pdf_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith('.pdf')
    ]

    if not pdf_files:
        raise ValueError(f"No PDF files found in the folder: {folder_path}")

    return pdf_files
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_txt_from_pdf(pdf_files, filter_ref=False):
    """Extract text from each PDF, quarter each page, and tabulate the pieces.

    Each page's text is whitespace-normalized, optionally stripped of its
    references section, then split into 4 equal character slices; slices with
    more than 40 tokens become one DataFrame row each.

    Args:
        pdf_files: list of PDF file paths (e.g. from get_pdf_files()).
        filter_ref: when True, run remove_ref() on each page's text first.

    Returns:
        pandas.DataFrame with columns: 'file name', 'page number',
        'page section', 'content', 'tokens'.
    """

    data = []

    for pdf in pdf_files:

        try:
            with open(pdf, 'rb') as pdf_content:

                pdf_reader = PyPDF2.PdfReader(pdf_content)

                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    page_text = page.extract_text()
                    # Collapse all runs of whitespace to single spaces.
                    words = page_text.split()
                    page_text_join = ' '.join(words)

                    if filter_ref:
                        page_text_join = remove_ref(page_text_join)

                    page_len = len(page_text_join)
                    div_len = page_len // 4 # Divide the page into 4 parts
                    # NOTE(review): integer division drops up to 3 trailing
                    # characters (page_len % 4) — confirm this is acceptable.
                    page_parts = [page_text_join[i*div_len:(i+1)*div_len] for i in range(4)]

                    # Skip near-empty slices (headers, figures, page numbers).
                    min_tokens = 40
                    for i, page_part in enumerate(page_parts):
                        if count_tokens(page_part) > min_tokens:
                            # Append the data to the list
                            data.append({
                                'file name': os.path.basename(pdf),
                                'page number': page_num + 1,
                                'page section': i+1,
                                'content': page_part,
                                'tokens': count_tokens(page_part)
                            })
        except Exception as e:
            # Best-effort: a corrupt/unreadable PDF is reported and skipped.
            print(f"Error processing {pdf}: {e}")

    # Create a DataFrame from the data
    df = pd.DataFrame(data)
    return df
|
| 99 |
+
|
| 100 |
+
def remove_ref(pdf_text):
    """Heuristically strip the references/acknowledgment section from *pdf_text*.

    If an explicit section heading is found, everything from it onward is cut.
    Otherwise a battery of citation-shaped regex patterns is applied, first to
    the whole text and then line by line, removing only matches that satisfy
    additional length/punctuation heuristics.

    Returns:
        str: the cleaned text.
    """

    pattern = r'(REFERENCES|Acknowledgment|ACKNOWLEDGMENT)'
    match = re.search(pattern, pdf_text)

    if match:
        # If a match is found, remove everything after the match
        start_index = match.start()
        clean_text = pdf_text[:start_index].strip()
    else:
        # Define a list of regular expression patterns for references
        reference_patterns = [
            '\[[\d\w]{1,3}\].+?[\d]{3,5}\.','\[[\d\w]{1,3}\].+?[\d]{3,5};','\([\d\w]{1,3}\).+?[\d]{3,5}\.','\[[\d\w]{1,3}\].+?[\d]{3,5},',
            '\([\d\w]{1,3}\).+?[\d]{3,5},','\[[\d\w]{1,3}\].+?[\d]{3,5}','[\d\w]{1,3}\).+?[\d]{3,5}\.','[\d\w]{1,3}\).+?[\d]{3,5}',
            '\([\d\w]{1,3}\).+?[\d]{3,5}','^[\w\d,\.– ;)-]+$',
        ]

        # Find and remove matches with the first eight patterns
        # NOTE(review): `matches` is a LIST here, so matches.count('.') counts
        # list elements equal to '.', not characters — the guard rarely fires;
        # confirm the intended semantics before relying on this branch.
        for pattern in reference_patterns[:8]:
            matches = re.findall(pattern, pdf_text, flags=re.S)
            pdf_text = re.sub(pattern, '', pdf_text) if len(matches) > 500 and matches.count('.') < 2 and matches.count(',') < 2 and not matches[-1].isdigit() else pdf_text

        # Split the text into lines
        lines = pdf_text.split('\n')

        # Strip each line and remove matches with the last two patterns
        # NOTE(review): re.findall('\d', matches) receives a list, which would
        # raise TypeError if len(matches) > 500 ever held — latent bug.
        for i, line in enumerate(lines):
            lines[i] = line.strip()
            for pattern in reference_patterns[7:]:
                matches = re.findall(pattern, lines[i])
                lines[i] = re.sub(pattern, '', lines[i]) if len(matches) > 500 and len(re.findall('\d', matches)) < 8 and len(set(matches)) > 10 and matches.count(',') < 2 and len(matches) > 20 else lines[i]

        # Join the lines back together, excluding any empty lines
        clean_text = '\n'.join([line for line in lines if line])

    return clean_text
|
| 136 |
+
|
| 137 |
+
def split_content(input_string, tokens):
    """Splits a string into chunks based on a maximum token count.

    Words are accumulated until the running token count would exceed *tokens*;
    the chunk is then cut at the last '.' or newline (falling back to a raw
    character cut) and the remainder carries over into the next chunk.

    Args:
        input_string (str): text to split.
        tokens (int): maximum tokens per chunk.

    Returns:
        list[str]: the chunks, in order.
    """

    MAX_TOKENS = tokens
    split_strings = []
    current_string = ""
    tokens_so_far = 0

    for word in input_string.split():
        # Check if adding the next word would exceed the max token limit
        if tokens_so_far + count_tokens(word) > MAX_TOKENS:
            # If we've reached the max tokens, look for the last dot or newline in the current string
            last_dot = current_string.rfind(".")
            last_newline = current_string.rfind("\n")

            # Find the index to cut the current string
            cut_index = max(last_dot, last_newline)

            # If there's no dot or newline, we'll just cut at the max tokens
            # NOTE(review): MAX_TOKENS is used here as a CHARACTER index, not a
            # token count — confirm this fallback is intentional.
            if cut_index == -1:
                cut_index = MAX_TOKENS

            # Add the substring to the result list and reset the current string and tokens_so_far
            split_strings.append(current_string[:cut_index + 1].strip())
            current_string = current_string[cut_index + 1:].strip()
            tokens_so_far = count_tokens(current_string)

        # Add the current word to the current string and update the token count
        current_string += " " + word
        tokens_so_far += count_tokens(word)

    # Add the remaining current string to the result list
    split_strings.append(current_string.strip())

    return split_strings
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def combine_section(df):
    """Collapse all rows of each PDF ('file name') into a single row.

    Page contents are joined with spaces and token counts summed via the
    aggregate_content / aggregate_tokens helpers.
    """
    merged = (
        df.groupby('file name')
        .agg({'content': aggregate_content, 'tokens': aggregate_tokens})
        .reset_index()
    )
    return merged
|
| 182 |
+
def combine_main_SI(df):
    """Merge each paper's main text with its SI rows by shared name prefix.

    Rows whose 'file name' shares the same extract_title() prefix (e.g.
    "paper.pdf" and "paper_SI.pdf") are concatenated and their token counts
    summed.

    Returns:
        DataFrame with columns 'file name', 'content', 'tokens'.
    """
    # Work on a copy so the caller's DataFrame does not grow a 'main_part'
    # column as a side effect (matches the df.copy() convention in model_1/2).
    df = df.copy()
    df['main_part'] = df['file name'].apply(extract_title)
    merged_df = df.groupby('main_part').agg({
        'content': ''.join,
        'tokens': sum
    }).reset_index()

    return merged_df.rename(columns={'main_part': 'file name'})
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def aggregate_content(series):
    """Concatenate all string entries of *series* with single-space separators."""
    joined = ' '.join(item for item in series)
    return joined
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def aggregate_tokens(series):
    """Return the total of all token counts in *series*."""
    total = series.sum()
    return total
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def extract_title(file_name):
    """Return the paper identifier: the part of *file_name* before the first
    underscore, with a trailing '.pdf' extension removed.

    Fixes a bug: ``rstrip('.pdf')`` stripped any trailing run of the
    characters '.', 'p', 'd', 'f' (so "mofd.pdf" wrongly became "mo").
    """
    title = file_name.split('_')[0]
    # str.removesuffix (3.9+) removes only the literal extension.
    return title.removesuffix('.pdf')
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def model_1(df):
    """Model 1 will turn text in dataframe to a summarized reaction condition table.

    For each row (one paper), the 'content' text is split into <=3000-token
    chunks, each chunk is sent to the Groq chat API with a fixed extraction
    prompt, and the returned markdown tables are concatenated into a new
    'summarized' column.

    Args:
        df: DataFrame whose first column identifies the paper and whose
            'content' column holds the text to summarize.

    Returns:
        A copy of *df* with an added 'summarized' column.
    """
    # Initialize Groq client


    response_msgs = []  # one combined answer string per DataFrame row

    for index, row in df.iterrows():
        column1_value = row[df.columns[0]]  # paper identifier (first column)
        column2_value = row['content']      # full text for this paper

        # Split long papers so each request fits the model context window.
        max_tokens = 3000
        if count_tokens(column2_value) > max_tokens:
            context_list = split_content(column2_value, max_tokens)
        else:
            context_list = [column2_value]

        answers = '' # Collect answers from Groq
        for context in context_list:
            print("Start to analyze paper " + str(column1_value))
            user_prompt = f"""This is an experimental section on MOF synthesis from paper {column1_value}

Context:
{context}

Q: Can you summarize the following details in a table:
compound name or chemical formula (if the name is not provided), metal source, metal amount, organic linker(s),
linker amount, modulator, modulator amount or volume, solvent(s), solvent volume(s), reaction temperature,
and reaction time?

Rules:
- If any information is not provided or you are unsure, use "N/A"
- Focus on extracting experimental conditions from only the MOF synthesis
- Ignore information related to organic linker synthesis, MOF postsynthetic modification, high throughput (HT) experiment details or catalysis reactions
- If multiple conditions are provided for the same compound, use multiple rows to represent them
- If multiple units or components are provided for the same factor (e.g., g and mol for the weight, multiple linker or metals, multiple temperature and reaction time, mixed solvents, etc), include them in the same cell and separate by comma
- The table should have 11 columns, all in lowercase:
| compound name | metal source | metal amount | linker | linker amount | modulator | modulator amount or volume | solvent | solvent volume | reaction temperature | reaction time |

Respond with ONLY the table."""

            # Up to 3 attempts per chunk; failures back off 60 s before retry.
            attempts = 3
            while attempts > 0:
                try:
                    response = client.chat.completions.create(
                        model="llama-3.1-70b-versatile", # or another available Groq model
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant specialized in extracting MOF synthesis details."},
                            {"role": "user", "content": user_prompt}
                        ]
                    )

                    answers_text = response.choices[0].message.content
                    # Check if response is valid
                    if answers_text and not answers_text.lower().startswith("i apologize"):
                        answers += '\n' + answers_text
                        break
                    else:
                        # Treat refusals/empty replies as failures so they retry.
                        raise ValueError("Invalid or apologetic response")

                except Exception as e:
                    attempts -= 1
                    if attempts <= 0:
                        print(f"Error: Failed to process paper {column1_value}. Skipping. (model 1)")
                        break
                    print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 1)")
                    time.sleep(60)

        response_msgs.append(answers)

    # Copy before assignment so the caller's frame is not mutated.
    df = df.copy()
    df.loc[:, 'summarized'] = response_msgs
    return df
|
| 284 |
+
|
| 285 |
+
def model_2(df):
    """Model 2 identifies experiment sections and combines results.

    Classifies every page section as containing a concrete MOF synthesis
    ("Yes"/"No") via the Groq chat API, drops "No" sections surrounded by
    other "No" sections, merges the survivors per paper, and feeds the merged
    text to model_1 for condition extraction.

    Args:
        df: DataFrame from get_txt_from_pdf() — first column is the file
            name, second the page number, plus a 'content' column.

    Returns:
        DataFrame with columns 'file name' and 'summarized'.
    """

    response_msgs = []
    prev_paper_name = None
    # Highest page number per paper, used only for progress reporting.
    total_pages = df.groupby(df.columns[0])[df.columns[1]].max()

    for _, row in df.iterrows():
        paper_name = row[df.columns[0]]
        # NOTE(review): page_number is never used below — confirm it can go.
        page_number = row[df.columns[1]]

        # Announce each new paper exactly once.
        if paper_name != prev_paper_name:
            print(f'Processing paper: {paper_name}. Total pages: {total_pages[paper_name]}')
            prev_paper_name = paper_name

        context = row['content']

        user_prompt = """I will provide a context. Determine if the section contains a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes.

Examples:
1. Context: "In a 4-mL scintillation vial, the linker H2PZVDC (91.0 mg, 0.5 mmol, 1 equiv.) was dissolved in N,N-dimethylformamide (DMF) (0.6 mL) upon sonication."
Answer: Yes

2. Context: "Synthesis and Characterization of MOFs, Abbreviations, and General Procedures."
Answer: No

3. Context: "The design and synthesis of metal-organic frameworks (MOFs) has yielded a large number of structures"
Answer: No

Respond with only "Yes" or "No" based on the following context:
""" + context

        # Up to 3 attempts per section; exhausted retries default to "No".
        attempts = 3
        while attempts > 0:
            try:
                response = client.chat.completions.create(
                    model="llama-3.1-70b-versatile", # or another available Groq model
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant specialized in identifying MOF synthesis sections."},
                        {"role": "user", "content": user_prompt}
                    ]
                )
                answers = response.choices[0].message.content.strip()

                # Validate response
                if answers in ["Yes", "No"]:
                    break
                else:
                    raise ValueError("Invalid response")

            except Exception as e:
                attempts -= 1
                if attempts > 0:
                    print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 2)")
                    time.sleep(60)
                else:
                    print(f"Error: Failed to process paper {paper_name}. Skipping. (model 2)")
                    answers = "No"
                    break

        response_msgs.append(answers)

    # Copy before assignment so the caller's frame is not mutated.
    df = df.copy()
    df.loc[:,'classification'] = response_msgs

    # Remove consecutive "No" entries
    # (a "No" is kept if it is adjacent to a "Yes" — synthesis text often
    # spills into neighboring sections).
    mask_no = df["classification"].str.startswith("No")
    mask_surrounded_by_no = mask_no.shift(1, fill_value=False) & mask_no.shift(-1, fill_value=False)
    mask_to_remove = mask_no & mask_surrounded_by_no
    filtered_df = df[~mask_to_remove]

    # Combine sections and process
    combined_df = combine_main_SI(combine_section(filtered_df))
    add_table_df = model_1(combined_df)
    return add_table_df[['file name','summarized']]
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
google-generativeai
|
| 3 |
+
python-dotenv
|
| 4 |
+
langchain
|
| 5 |
+
PyPDF2
|
| 6 |
+
chromadb
|
| 7 |
+
faiss-cpu
|
| 8 |
+
langchain_google_genai
|
| 9 |
+
groq
|
| 10 |
+
langchain-groq
|
| 11 |
+
langchain_community
|
| 12 |
+
requests
|
| 13 |
+
# PyPDF2  (duplicate entry — already listed above)
|
| 14 |
+
pandas
|
| 15 |
+
tiktoken
|
streamlit.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from app import get_pdf_text
|
| 3 |
+
from app import get_text_chunks
|
| 4 |
+
from app import get_vector_store
|
| 5 |
+
from app import user_input
|
| 6 |
+
from paper_reading import get_pdf_files
|
| 7 |
+
from paper_reading import get_txt_from_pdf
|
| 8 |
+
from paper_reading import model_2
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def mof_synthesis_processing(df):
    """Run the MOF-synthesis extraction pipeline on *df*.

    Currently a thin wrapper: it surfaces a warning that the full processing
    step is still a placeholder, then delegates to paper_reading.model_2.
    """
    st.warning("Full MOF processing implementation needed")
    processed = model_2(df)
    return processed
|
| 17 |
+
|
| 18 |
+
def main():
    """Streamlit entry point: two-mode PDF tool.

    Mode 1 ("Chat with PDF"): upload PDFs, index them into FAISS, and ask
    questions answered from the index.
    Mode 2 ("MOF Synthesis Paper Processing"): point at a folder of papers,
    extract text, run the MOF-synthesis pipeline, and download the results.
    """
    st.set_page_config("Multi-Functional PDF Tool")

    # Sidebar for mode selection
    app_mode = st.sidebar.selectbox("Choose Application Mode",
                                    ["Chat with PDF", "MOF Synthesis Paper Processing"])

    if app_mode == "Chat with PDF":
        st.header("Chat with PDF using Groq and HuggingFace Embeddings💁")

        user_question = st.text_input("Ask a Question from the PDF Files")

        # Answer as soon as a question is typed; relies on a previously
        # built "faiss_index" (created by the Submit & Process flow below).
        if user_question:
            user_input(user_question)

        with st.sidebar:
            st.title("Upload PDFs:")
            pdf_docs = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
            if st.button("Submit & Process"):
                with st.spinner("Processing..."):
                    # Extract -> chunk -> embed+persist pipeline from app.py.
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    get_vector_store(text_chunks)
                    st.success("PDFs Processed Successfully")

    elif app_mode == "MOF Synthesis Paper Processing":
        st.title("MOF Synthesis Paper Processing")

        # Folder input for PDF processing
        folder = st.text_input("Enter the full path to the folder containing PDF files")

        if folder:
            try:
                # get_pdf_files raises ValueError for a bad/empty folder;
                # caught by the except below and shown to the user.
                folder_path = get_pdf_files(folder)

                if st.button("Process MOF Synthesis Papers"):
                    with st.spinner("Extracting text from PDFs..."):
                        # Extract text from PDFs
                        pdf_dataframe = get_txt_from_pdf(folder_path)

                        if not pdf_dataframe.empty:
                            st.write(f"Extracted text from {len(pdf_dataframe['file name'].unique())} PDFs")
                            st.write(f"Total pages processed: {len(pdf_dataframe)}")

                            # Process MOF synthesis papers
                            with st.spinner("Analyzing MOF Synthesis Papers..."):
                                processed_df = mof_synthesis_processing(pdf_dataframe)

                            if not processed_df.empty:
                                # Display results
                                st.success("MOF Synthesis Papers Processed Successfully!")

                                # Option to download processed data
                                csv = processed_df.to_csv(index=False)
                                st.download_button(
                                    label="Download Processed Data",
                                    data=csv,
                                    file_name="mof_synthesis_data.csv",
                                    mime="text/csv"
                                )

                                # Display first few rows
                                st.dataframe(processed_df)
                            else:
                                st.warning("No MOF synthesis data was extracted.")

                        else:
                            st.error("No PDFs found or error in extracting text")

            except Exception as e:
                st.error(f"An error occurred: {e}")
|
| 90 |
+
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()
|