# Spaces: darthPanda / table_detector
# Status: Runtime error
# File size: 5,755 Bytes

import streamlit as st
import os
import shutil
import fitz
import pandas as pd
import easyocr
from openai import OpenAI
from dotenv import load_dotenv
import ast

# Load variables from a local .env file (if one is present) into the process
# environment before the rest of the module runs.
load_dotenv()

@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV text and return it as UTF-8 encoded bytes.

    The function is wrapped in ``st.cache_data`` so the conversion is cached
    instead of being recomputed on every rerun.
    """
    csv_text = df.to_csv()
    encoded = csv_text.encode('utf-8')
    return encoded

def list_files(directory):
    """Lazily yield the path of every file found beneath *directory*.

    The tree is traversed with ``os.walk``; each yielded value is the
    containing folder joined with the file name.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        yield from (os.path.join(folder, filename) for filename in filenames)

def correct_list(client, list_str):
    """Ask the chat model to repair the syntax of a malformed Python list.

    Args:
        client: Object exposing ``chat.completions.create(messages=..., model=...)``
            (the file passes an ``OpenAI`` client).
        list_str: Text of the list literal that failed to parse.

    Returns:
        The ``message.content`` of the first choice in the model's reply.
        It is returned as-is; the caller is responsible for parsing it.
    """
    base_prompt = '''Above python list has syntax error.
Correct the syntax without changing the values. Output should only be the corrected list.
'''
    # The instruction refers to the list "above", so it must start on its own
    # line rather than being glued to the last character of the list.
    separator = '' if list_str.endswith('\n') else '\n'
    prompt = list_str + separator + base_prompt

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    return chat_completion.choices[0].message.content

# Start from a clean slate: drop working directories left over from a
# previous run. os.path.isdir() is already False for a missing path, so no
# separate existence check is needed.
for stale_dir in ('prediction', 'temp_pdf'):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# Check if the directory exists
if not os.path.exists('temp_pdf'):
    # If it does not exist, create it
    os.makedirs('temp_pdf')
    print('not_found')
else:
    print('found')

temp_file_path = 'temp//temp.pdf'

# The uploaded PDF is written to temp_file_path, which lives in 'temp', not in
# 'temp_pdf'. Create that parent directory too; otherwise opening the path for
# writing fails with FileNotFoundError.
os.makedirs(os.path.dirname(temp_file_path), exist_ok=True)

# OCR engine configured for English text; built once at module level.
reader = easyocr.Reader(['en'])

# Upper bound on the number of PDF pages rendered and OCR'd per upload.
_MAX_PAGES = 4

# Exceptions ast.literal_eval is documented to raise on malformed input.
_LITERAL_EVAL_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

# Column headers of the result table shown to the user and exported as CSV.
_TABLE_COLUMNS = ['Courses', 'Grade']


def _ocr_pdf(pdf_path, image_dir='temp_pdf', zoom=4):
    """Render the first pages of *pdf_path* to PNG files and return their OCR text.

    At most ``_MAX_PAGES`` pages are processed; an error banner is shown when
    the document is longer. The text of each page is joined with spaces and
    pages are separated by a newline so words at page boundaries do not fuse.
    """
    os.makedirs(image_dir, exist_ok=True)
    matrix = fitz.Matrix(zoom, zoom)
    page_texts = []

    doc = fitz.open(pdf_path)
    try:
        page_total = len(doc)
        if page_total > _MAX_PAGES:
            page_total = _MAX_PAGES
            st.error(f'Page limit exceeded. processing first {_MAX_PAGES} images')

        for index in range(page_total):
            st.markdown(f"Processing page {index+1}...")
            image_path = os.path.join(image_dir, f"image_{index}.png")
            doc.load_page(index).get_pixmap(matrix=matrix).save(image_path)
            # detail=0 is passed so that only the recognized strings come back.
            page_texts.append(' '.join(reader.readtext(image_path, detail=0)))
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()

    return '\n'.join(page_texts)


def _request_scores(client, transcript_text):
    """Send the OCR text to the chat model and return the raw reply content."""
    base_prompt = '''Above is the OCR extracted transcript.
Extract student's points/scores along with subject. Output should only be a lists of dict with course and points/scores as its keys.
'''
    # Keep the instruction on its own line, below the transcript it refers to.
    prompt = f"{transcript_text}\n{base_prompt}"

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    return chat_completion.choices[0].message.content


def _parse_records(client, reply_text):
    """Parse the model reply as a Python literal, retrying once via correct_list.

    Raises one of ``_LITERAL_EVAL_ERRORS`` when the repaired text is still
    not a valid literal.
    """
    try:
        return ast.literal_eval(reply_text)
    except _LITERAL_EVAL_ERRORS:
        # Malformed literal: ask the model to fix the syntax, then parse again.
        return ast.literal_eval(correct_list(client, reply_text))


def _records_to_frame(records):
    """Build the two-column result table from the parsed model output.

    The first two keys of the first dict decide which values become the
    course and the grade/score. Entries that are not dicts are skipped, and
    a missing key yields an empty cell instead of raising. An empty frame is
    returned when no usable entry exists.
    """
    if isinstance(records, dict):
        records = [records]
    if not isinstance(records, (list, tuple)):
        records = []

    dict_rows = [entry for entry in records if isinstance(entry, dict)]
    if not dict_rows:
        return pd.DataFrame(columns=_TABLE_COLUMNS)

    keys_list = list(dict_rows[0])
    print(keys_list)
    if len(keys_list) < 2:
        return pd.DataFrame(columns=_TABLE_COLUMNS)

    course_key, score_key = keys_list[:2]
    rows = [[entry.get(course_key), entry.get(score_key)] for entry in dict_rows]
    return pd.DataFrame(rows, columns=_TABLE_COLUMNS)


def main():
    """Streamlit entry point: upload a transcript PDF, OCR it, extract scores.

    Flow: ask for the OpenAI credential, accept a PDF upload, OCR its first
    pages, ask the chat model for course/score pairs, show them as a table
    and offer the table as a CSV download. Each stage returns early with an
    on-page message when it has nothing usable to hand to the next one.
    """
    # Set the title of the app
    st.title("Transcript parser")

    # The credential is an API key: mask it while typing and never echo it
    # back onto the page.
    credential = (st.text_input('Credential', type='password') or '').strip()
    if not credential:
        return

    # Create a file uploader to upload PDF files
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return

    # Make sure the upload's parent directory exists before writing to it.
    os.makedirs(os.path.dirname(temp_file_path) or '.', exist_ok=True)
    with open(temp_file_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    with st.spinner('Performing OCR...'):
        context = _ocr_pdf(temp_file_path)
        print(context)

    if not context.strip():
        st.error('No text could be extracted from the uploaded PDF.')
        return
    st.success('OCR completed')

    with st.spinner('Parsing extracted text...'):
        st.markdown('### Extracted data from transcripts')
        client = OpenAI(
            api_key=credential,
        )

        list_str = _request_scores(client, context)
        print(list_str)
        if not list_str:
            st.error('The model returned an empty response.')
            return

        try:
            actual_list = _parse_records(client, list_str)
        except _LITERAL_EVAL_ERRORS:
            st.error('Could not parse the model response into a list of courses.')
            return

        df = _records_to_frame(actual_list)
        if df.empty:
            st.warning('No course/score pairs were found in the model response.')
            return

        st.dataframe(df)

    csv = convert_df(df)

    st.download_button(
        label="Download Parsed transcript",
        data=csv,
        file_name='transcript.csv',
        mime='text/csv',
    )

    st.success('Transcript Processing Completed!')

# Run the app only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()