"""Streamlit app: OCR an uploaded transcript PDF and extract course scores.

Flow: the uploaded PDF is saved to disk, its first pages are rendered to PNG
images with PyMuPDF, EasyOCR reads the text from each image, and an OpenAI
chat model is asked to turn that text into a list of dicts.  The result is
shown as a table and offered as a CSV download.
"""

import ast
import os
import shutil

import easyocr
import fitz
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# Chat model used for both the extraction and the syntax-repair request.
_MODEL = "gpt-3.5-turbo"
# Only the first MAX_PAGES pages of an uploaded PDF are processed.
MAX_PAGES = 4
# Directory that receives the rendered page images.
IMAGE_DIR = 'temp_pdf'
# Exceptions ast.literal_eval is documented to raise on malformed input.
_LITERAL_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError,
                   RecursionError)


@st.cache_data
def convert_df(df):
    """Return *df* serialized as UTF-8 encoded CSV bytes.

    IMPORTANT: the conversion is cached to prevent recomputation on every
    Streamlit rerun.
    """
    return df.to_csv().encode('utf-8')


def list_files(directory):
    """Yield the path of every file found under *directory*, recursively."""
    for root, _dirs, files in os.walk(directory):
        for name in files:
            yield os.path.join(root, name)


def _ask(client, prompt):
    """Send *prompt* as a single user message and return the reply text."""
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=_MODEL,
    )
    return chat_completion.choices[0].message.content


def correct_list(client, list_str):
    """Ask the model to repair the syntax of a malformed Python list.

    Args:
        client: OpenAI client used to send the request.
        list_str: Text that failed to parse as a Python literal.

    Returns:
        The model's reply, expected to be the corrected list as text.
    """
    base_prompt = (
        "Above python list has syntax error. "
        "Correct the syntax without changing the values. "
        "Output should only be the corrected list.\n"
    )
    # Separate the payload from the instruction so the two do not run
    # together into one token stream.
    return _ask(client, f"{list_str}\n\n{base_prompt}")


# Start every run from a clean slate: drop outputs left by a previous run.
for _stale_dir in ('prediction', IMAGE_DIR):
    if os.path.isdir(_stale_dir):
        shutil.rmtree(_stale_dir)

# Check if the image directory exists; create it if it does not.
if not os.path.exists(IMAGE_DIR):
    os.makedirs(IMAGE_DIR)
    print('not_found')
else:
    print('found')

temp_file_path = 'temp//temp.pdf'
# The upload is written below 'temp', which nothing else creates; without
# this the first write fails with FileNotFoundError.
os.makedirs(os.path.dirname(temp_file_path), exist_ok=True)


@st.cache_resource
def _load_reader():
    """Build the English EasyOCR reader once and reuse it across reruns."""
    return easyocr.Reader(['en'])


reader = _load_reader()


def _ocr_pdf(pdf_path, max_pages=MAX_PAGES, zoom=4):
    """Render the first pages of a PDF to images and return their OCR text.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Maximum number of leading pages to process.
        zoom: Scale factor applied on both axes when rendering a page.

    Returns:
        The recognized text of all processed pages, joined by single spaces.
    """
    doc = fitz.open(pdf_path)
    try:
        page_total = doc.page_count
        if page_total > max_pages:
            st.error(
                f'Page limit exceeded. processing first {max_pages} images'
            )
        mat = fitz.Matrix(zoom, zoom)
        page_texts = []
        for i in range(min(page_total, max_pages)):
            st.markdown(f"Processing page {i+1}...")
            image_path = os.path.join(IMAGE_DIR, f"image_{i}.png")
            pix = doc.load_page(i).get_pixmap(matrix=mat)
            pix.save(image_path)
            page_texts.append(
                ' '.join(reader.readtext(image_path, detail=0))
            )
    finally:
        # Release the document even when rendering or OCR fails.
        doc.close()
    # Join pages with a space so the last word of one page does not fuse
    # with the first word of the next.
    return ' '.join(page_texts)


def _parse_records(client, list_str):
    """Parse the model reply as a Python literal, retrying once via the model.

    Raises:
        ValueError, SyntaxError, TypeError, MemoryError, RecursionError:
            if the corrected reply still cannot be parsed.
    """
    try:
        return ast.literal_eval(list_str)
    except _LITERAL_ERRORS:
        return ast.literal_eval(correct_list(client, list_str))


def _records_to_frame(records):
    """Convert a list of record dicts into a two-column DataFrame.

    The first two keys of the first record select the course and score
    fields for every record; a record lacking one of them gets ``None``.

    Raises:
        ValueError: if *records* is not a non-empty list of dicts whose
            first element has at least two keys.
    """
    if not isinstance(records, (list, tuple)) or not records:
        raise ValueError('expected a non-empty list of records')
    if not all(isinstance(record, dict) for record in records):
        raise ValueError('every record must be a dict')

    # Saving the keys in a variable (as a list)
    keys_list = list(records[0].keys())
    print(keys_list)
    if len(keys_list) < 2:
        raise ValueError('records need a course key and a score key')

    course_key, score_key = keys_list[:2]
    rows = [
        [record.get(course_key), record.get(score_key)]
        for record in records
    ]
    return pd.DataFrame(rows, columns=['Courses', 'Grade'])


def main():
    """Run the Streamlit page: upload, OCR, LLM extraction, table, download."""
    # Set the title of the app
    st.title("Transcript parser")

    credential = st.text_input('Credential')
    if credential:
        # NOTE(review): this echoes the API key on the page in clear text;
        # consider removing it or using st.text_input(type="password").
        st.markdown(credential)
    # Fall back to the key loaded from the environment / .env file.
    api_key = credential or os.environ.get("OPENAI_API_KEY", "")

    # Create a file uploader to upload PDF files
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return

    if not api_key:
        # Fail before the expensive OCR step instead of on the API call.
        st.error('An OpenAI API key is required to parse the transcript.')
        return

    with open(temp_file_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    with st.spinner('Performing OCR...'):
        context = _ocr_pdf(temp_file_path)
    print(context)
    st.success('OCR completed')

    with st.spinner('Parsing extracted text...'):
        st.markdown('### Extracted data from transcripts')
        base_prompt = (
            "Above is the OCR extracted transcript. "
            "Extract student's points/scores along with subject. "
            "Output should only be a lists of dict with course and "
            "points/scores as its keys.\n"
        )
        client = OpenAI(api_key=api_key)

        list_str = _ask(client, f"{context}\n\n{base_prompt}")
        print(list_str)

        try:
            actual_list = _parse_records(client, list_str)
            df = _records_to_frame(actual_list)
        except _LITERAL_ERRORS:
            st.error('Could not turn the model output into a table.')
            return

        st.dataframe(df)

        csv = convert_df(df)
        st.download_button(
            label="Download Parsed transcript",
            data=csv,
            file_name='transcript.csv',
            mime='text/csv',
        )
    st.success('Transcript Processing Completed!')


# Run the app
if __name__ == "__main__":
    main()