import requests
import PyPDF2
import re
import os
import requests
import pandas as pd
import tiktoken
import time
from io import StringIO
from groq import Groq
import numpy as np
# SECURITY: the Groq API key was previously hard-coded (and therefore leaked)
# in this file. Read it from the environment instead; the exposed key should
# be revoked and GROQ_API_KEY set in the deployment environment.
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)
def count_tokens(text):
    """Return the number of tokens *text* encodes to under cl100k_base."""
    tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode(text))
def get_pdf_files(folder_path):
    """
    Recursively collect the full paths of all PDF files under *folder_path*.

    Args:
        folder_path (str): Directory to search (searched recursively).

    Returns:
        list: Full paths of every file whose name ends in ``.pdf``
        (case-insensitive), in ``os.walk`` order.

    Raises:
        ValueError: If the folder does not exist, or no PDFs are found.
    """
    if not os.path.exists(folder_path):
        raise ValueError(f"Folder path does not exist: {folder_path}")

    pdf_files = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.lower().endswith('.pdf')
    ]

    if not pdf_files:
        raise ValueError(f"No PDF files found in the folder: {folder_path}")
    return pdf_files
def get_txt_from_pdf(pdf_files, filter_ref=False):
    """
    Extract text from PDFs into a DataFrame of page quarters.

    Each page's text is whitespace-normalized, optionally stripped of its
    reference section, then split into four roughly equal character chunks;
    only chunks longer than ``min_tokens`` tokens are kept.

    Args:
        pdf_files (list): Paths of the PDF files to read.
        filter_ref (bool): When True, pass each page through ``remove_ref``.

    Returns:
        pandas.DataFrame: Columns ``file name``, ``page number``,
        ``page section``, ``content`` and ``tokens``.
    """
    data = []
    min_tokens = 40  # skip fragments too short to carry synthesis details
    for pdf in pdf_files:
        try:
            with open(pdf, 'rb') as pdf_content:
                pdf_reader = PyPDF2.PdfReader(pdf_content)
                for page_num, page in enumerate(pdf_reader.pages):
                    # Collapse all runs of whitespace into single spaces.
                    page_text_join = ' '.join(page.extract_text().split())
                    if filter_ref:
                        page_text_join = remove_ref(page_text_join)
                    div_len = len(page_text_join) // 4
                    # Quarter the page. The last slice runs to the end of the
                    # string, so the remainder (up to 3 characters that the
                    # original integer division silently dropped) is kept.
                    page_parts = [
                        page_text_join[i * div_len:(i + 1) * div_len if i < 3 else None]
                        for i in range(4)
                    ]
                    for i, page_part in enumerate(page_parts):
                        tokens = count_tokens(page_part)  # computed once per part
                        if tokens > min_tokens:
                            data.append({
                                'file name': os.path.basename(pdf),
                                'page number': page_num + 1,
                                'page section': i + 1,
                                'content': page_part,
                                'tokens': tokens,
                            })
        except Exception as e:
            # Best-effort batch processing: report and move on to the next PDF.
            print(f"Error processing {pdf}: {e}")
    return pd.DataFrame(data)
def remove_ref(pdf_text: str) -> str:
    """Strip the bibliography/acknowledgment tail from extracted PDF text.

    If an explicit section heading is found, everything from that heading on
    is dropped. Otherwise a set of heuristic regex patterns for citation-like
    runs is applied, with guard conditions meant to avoid deleting body text.
    """
    # Case 1: an explicit section marker — cut the document at the heading.
    pattern = r'(REFERENCES|Acknowledgment|ACKNOWLEDGMENT)'
    match = re.search(pattern, pdf_text)
    if match:
        # If a match is found, remove everything after the match
        start_index = match.start()
        clean_text = pdf_text[:start_index].strip()
    else:
        # Case 2: no heading — fall back to citation-shaped regexes, e.g.
        # "[12] ... 2019." or "(3) ... 1998,". The last pattern matches whole
        # lines made only of reference-ish punctuation/words.
        # NOTE(review): these are not raw strings; they rely on Python passing
        # unknown escapes like \d through to re unchanged.
        reference_patterns = [
            '\[[\d\w]{1,3}\].+?[\d]{3,5}\.','\[[\d\w]{1,3}\].+?[\d]{3,5};','\([\d\w]{1,3}\).+?[\d]{3,5}\.','\[[\d\w]{1,3}\].+?[\d]{3,5},',
            '\([\d\w]{1,3}\).+?[\d]{3,5},','\[[\d\w]{1,3}\].+?[\d]{3,5}','[\d\w]{1,3}\).+?[\d]{3,5}\.','[\d\w]{1,3}\).+?[\d]{3,5}',
            '\([\d\w]{1,3}\).+?[\d]{3,5}','^[\w\d,\.– ;)-]+$',
        ]
        # Find and remove matches with the first eight patterns
        # NOTE(review): `matches` is a LIST here, so `matches.count('.')` counts
        # elements exactly equal to '.', and the substitution only fires when a
        # single page yields >500 matches — i.e. almost never. This looks like
        # it was meant to operate on each match string; confirm intent.
        for pattern in reference_patterns[:8]:
            matches = re.findall(pattern, pdf_text, flags=re.S)
            pdf_text = re.sub(pattern, '', pdf_text) if len(matches) > 500 and matches.count('.') < 2 and matches.count(',') < 2 and not matches[-1].isdigit() else pdf_text
        # Split the text into lines
        lines = pdf_text.split('\n')
        # Strip each line and remove matches with the last two patterns
        # NOTE(review): `re.findall('\d', matches)` would raise TypeError on a
        # list; it is only reachable when len(matches) > 500, so the `and`
        # chain short-circuits around it in practice. Latent bug — confirm.
        for i, line in enumerate(lines):
            lines[i] = line.strip()
            for pattern in reference_patterns[7:]:
                matches = re.findall(pattern, lines[i])
                lines[i] = re.sub(pattern, '', lines[i]) if len(matches) > 500 and len(re.findall('\d', matches)) < 8 and len(set(matches)) > 10 and matches.count(',') < 2 and len(matches) > 20 else lines[i]
        # Join the lines back together, excluding any empty lines
        clean_text = '\n'.join([line for line in lines if line])
    return clean_text
def split_content(input_string, tokens):
    """Split *input_string* into chunks of at most *tokens* tokens each.

    Chunks are cut preferentially at the last sentence end ('.') or newline
    inside the accumulated text, so chunk boundaries fall on natural breaks
    where possible.

    Args:
        input_string (str): Text to split.
        tokens (int): Maximum tokens per chunk.

    Returns:
        list[str]: The chunks, in order.
    """
    MAX_TOKENS = tokens
    split_strings = []
    current_string = ""
    tokens_so_far = 0
    for word in input_string.split():
        # Would adding the next word exceed the budget? Flush first if so.
        if tokens_so_far + count_tokens(word) > MAX_TOKENS:
            # Prefer to cut at the last sentence end or newline.
            cut_index = max(current_string.rfind("."), current_string.rfind("\n"))
            if cut_index == -1:
                # No natural break point: flush the whole accumulated chunk.
                # (The original used `cut_index = MAX_TOKENS`, conflating a
                # token budget with a character offset.)
                cut_index = len(current_string) - 1
            split_strings.append(current_string[:cut_index + 1].strip())
            # Carry any text after the cut point into the next chunk.
            current_string = current_string[cut_index + 1:].strip()
            tokens_so_far = count_tokens(current_string)
        # Accumulate the word and update the running token count.
        current_string += " " + word
        tokens_so_far += count_tokens(word)
    # Flush whatever is left.
    split_strings.append(current_string.strip())
    return split_strings
def combine_section(df):
    """Collapse every row of each PDF into one: join content, total tokens."""
    combined = (
        df.groupby('file name')
        .agg(
            content=('content', aggregate_content),
            tokens=('tokens', aggregate_tokens),
        )
        .reset_index()
    )
    return combined
def combine_main_SI(df):
    """Merge rows whose file names share a stem (a paper and its SI).

    A ``main_part`` column is derived from each file name via
    ``extract_title``; rows with the same stem have their content
    concatenated and their token counts summed.
    """
    df['main_part'] = df['file name'].apply(extract_title)
    merged = (
        df.groupby('main_part')
        .agg({'content': ''.join, 'tokens': sum})
        .reset_index()
        .rename(columns={'main_part': 'file name'})
    )
    return merged
def aggregate_content(series):
    """Concatenate every element of *series* into one space-separated string."""
    return ' '.join(piece for piece in series)
def aggregate_tokens(series):
    """Return the total of all token counts in *series*."""
    total = series.sum()
    return total
def extract_title(file_name):
    """Return the paper stem of *file_name*: the part before the first ``_``,
    with a literal trailing ``.pdf`` extension removed.

    Fix: the original used ``str.rstrip('.pdf')``, which strips any trailing
    run of the characters ``.``, ``p``, ``d``, ``f`` — e.g. ``'mofpd.pdf'``
    became ``'mo'``. Only the exact suffix is removed now.
    """
    title = file_name.split('_')[0]
    if title.endswith('.pdf'):
        title = title[:-len('.pdf')]
    return title
def model_1(df):
    """Summarize each row's text into a MOF reaction-condition table via Groq.

    For every row, the ``content`` column is (if needed) split into
    <=3000-token chunks, each chunk is sent to the Groq chat API with a
    table-extraction prompt, and the concatenated answers are stored in a new
    ``summarized`` column on a copy of *df*.

    Args:
        df: DataFrame whose first column identifies the paper and whose
            ``content`` column holds the text to summarize.

    Returns:
        A copy of *df* with an added ``summarized`` column.
    """
    # Initialize Groq client
    response_msgs = []
    for index, row in df.iterrows():
        # First column is the paper identifier (file name stem).
        column1_value = row[df.columns[0]]
        column2_value = row['content']
        # Chunk long texts so each request stays within the prompt budget.
        max_tokens = 3000
        if count_tokens(column2_value) > max_tokens:
            context_list = split_content(column2_value, max_tokens)
        else:
            context_list = [column2_value]
        answers = '' # Collect answers from Groq
        for context in context_list:
            print("Start to analyze paper " + str(column1_value))
            user_prompt = f"""This is an experimental section on MOF synthesis from paper {column1_value}
Context:
{context}
Q: Can you summarize the following details in a table:
compound name or chemical formula (if the name is not provided), metal source, metal amount, organic linker(s),
linker amount, modulator, modulator amount or volume, solvent(s), solvent volume(s), reaction temperature,
and reaction time?
Rules:
- If any information is not provided or you are unsure, use "N/A"
- Focus on extracting experimental conditions from only the MOF synthesis
- Ignore information related to organic linker synthesis, MOF postsynthetic modification, high throughput (HT) experiment details or catalysis reactions
- If multiple conditions are provided for the same compound, use multiple rows to represent them
- If multiple units or components are provided for the same factor (e.g., g and mol for the weight, multiple linker or metals, multiple temperature and reaction time, mixed solvents, etc), include them in the same cell and separate by comma
- The table should have 11 columns, all in lowercase:
| compound name | metal source | metal amount | linker | linker amount | modulator | modulator amount or volume | solvent | solvent volume | reaction temperature | reaction time |
Respond with ONLY the table."""
            # Up to 3 attempts per chunk; waits 60 s between retries.
            attempts = 3
            while attempts > 0:
                try:
                    response = client.chat.completions.create(
                        model="llama-3.1-70b-versatile", # or another available Groq model
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant specialized in extracting MOF synthesis details."},
                            {"role": "user", "content": user_prompt}
                        ]
                    )
                    answers_text = response.choices[0].message.content
                    # Check if response is valid
                    # A refusal ("I apologize...") is treated like a failure
                    # and retried; a usable answer is appended and we move on.
                    if answers_text and not answers_text.lower().startswith("i apologize"):
                        answers += '\n' + answers_text
                        break
                    else:
                        raise ValueError("Invalid or apologetic response")
                except Exception as e:
                    attempts -= 1
                    if attempts <= 0:
                        # Give up on this chunk; the paper keeps whatever
                        # answers earlier chunks produced (possibly '').
                        print(f"Error: Failed to process paper {column1_value}. Skipping. (model 1)")
                        break
                    print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 1)")
                    time.sleep(60)
        response_msgs.append(answers)
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()
    df.loc[:, 'summarized'] = response_msgs
    return df
def model_2(df):
    """Classify page sections as MOF-synthesis text, filter, and summarize.

    Each row's ``content`` is sent to the Groq chat API with a Yes/No prompt
    asking whether it contains a concrete synthesis procedure. Rows answered
    "No" that are also surrounded by "No" neighbours are dropped; the
    survivors are merged per paper and passed to ``model_1``.

    Args:
        df: DataFrame whose first column is the paper name, second column the
            page number, and which has a ``content`` column.

    Returns:
        DataFrame with columns ``file name`` and ``summarized``.
    """
    response_msgs = []
    prev_paper_name = None
    # Highest page number per paper, used only for the progress message.
    total_pages = df.groupby(df.columns[0])[df.columns[1]].max()
    for _, row in df.iterrows():
        paper_name = row[df.columns[0]]
        # NOTE(review): page_number is read but never used below — confirm
        # whether it was meant to appear in the prompt or logging.
        page_number = row[df.columns[1]]
        # Announce each paper once, when we reach its first row.
        if paper_name != prev_paper_name:
            print(f'Processing paper: {paper_name}. Total pages: {total_pages[paper_name]}')
            prev_paper_name = paper_name
        context = row['content']
        user_prompt = """I will provide a context. Determine if the section contains a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes.
Examples:
1. Context: "In a 4-mL scintillation vial, the linker H2PZVDC (91.0 mg, 0.5 mmol, 1 equiv.) was dissolved in N,N-dimethylformamide (DMF) (0.6 mL) upon sonication."
Answer: Yes
2. Context: "Synthesis and Characterization of MOFs, Abbreviations, and General Procedures."
Answer: No
3. Context: "The design and synthesis of metal-organic frameworks (MOFs) has yielded a large number of structures"
Answer: No
Respond with only "Yes" or "No" based on the following context:
""" + context
        # Up to 3 attempts per row; defaults to "No" after the final failure.
        attempts = 3
        while attempts > 0:
            try:
                response = client.chat.completions.create(
                    model="llama-3.1-70b-versatile", # or another available Groq model
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant specialized in identifying MOF synthesis sections."},
                        {"role": "user", "content": user_prompt}
                    ]
                )
                answers = response.choices[0].message.content.strip()
                # Validate response
                if answers in ["Yes", "No"]:
                    break
                else:
                    raise ValueError("Invalid response")
            except Exception as e:
                attempts -= 1
                if attempts > 0:
                    print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 2)")
                    time.sleep(60)
                else:
                    # Fail closed: treat an unclassifiable section as "No".
                    print(f"Error: Failed to process paper {paper_name}. Skipping. (model 2)")
                    answers = "No"
                    break
        response_msgs.append(answers)
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()
    df.loc[:,'classification'] = response_msgs
    # Remove consecutive "No" entries
    # A "No" row is dropped only when both neighbours are also "No", so
    # isolated "No" sections adjacent to synthesis text are kept for context.
    mask_no = df["classification"].str.startswith("No")
    mask_surrounded_by_no = mask_no.shift(1, fill_value=False) & mask_no.shift(-1, fill_value=False)
    mask_to_remove = mask_no & mask_surrounded_by_no
    filtered_df = df[~mask_to_remove]
    # Combine sections and process
    combined_df = combine_main_SI(combine_section(filtered_df))
    add_table_df = model_1(combined_df)
    return add_table_df[['file name','summarized']]
|