darthPanda committed on
Commit
ce86dab
·
1 Parent(s): 53e11fa

transcript parser

Browse files
Files changed (2) hide show
  1. app.py +153 -68
  2. requirements.txt +3 -4
app.py CHANGED
@@ -1,13 +1,43 @@
1
  import streamlit as st
2
- import shutil
3
  import os
4
- from ultralytics import YOLO
5
  import shutil
6
  import fitz
7
  import pandas as pd
8
- import gdown
9
- import camelot
10
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  if os.path.exists('prediction') and os.path.isdir('prediction'):
13
  shutil.rmtree('prediction')
@@ -23,74 +53,129 @@ if not os.path.exists('temp_pdf'):
23
  else:
24
  print('found')
25
 
26
- # Check if the directory exists
27
- if not os.path.exists('model'):
28
- # If it does not exist, create it
29
- os.makedirs('model')
30
- url = "https://drive.google.com/uc?id=1zv3VDW-LXuesKLrTm6xSdKGrycutFdHb"
31
- output = "model//best.pt"
32
- gdown.download(url, output, quiet=False)
33
-
34
  temp_file_path = 'temp//temp.pdf'
35
 
36
- model = YOLO('model//best.pt')
37
 
38
  def main():
39
  # Set the title of the app
40
- st.title("Table detection")
41
-
42
- # Create a file uploader to upload PDF files
43
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
44
-
45
- if uploaded_file is not None:
46
- # Create a temporary directory
47
-
48
- with open(temp_file_path, 'wb') as f:
49
- f.write(uploaded_file.getbuffer())
50
-
51
-
52
- inputpath = "temp//temp.pdf"
53
- st.markdown('### Images of detected tables')
54
- with st.spinner('Converting pdf to images...'):
55
- doc = fitz.open(inputpath)
56
- zoom = 4
57
- mat = fitz.Matrix(zoom, zoom)
58
- count = 0
59
- for p in doc:
60
- count += 1
61
- for i in range(count):
62
- val = f"image_{i}.png"
63
- val = os.path.join('temp_pdf',val)
64
- page = doc.load_page(i)
65
- pix = page.get_pixmap(matrix=mat)
66
- pix.save(val)
67
- doc.close()
68
-
69
-
70
- with st.spinner('Detecting table in images...'):
71
-
72
- # for index, entry in enumerate(os.listdir('./temp.pdf_dir')):
73
- for index, entry in enumerate(os.listdir('./temp_pdf')):
74
- print(entry)
75
- # Construct the full file path
76
- full_path = os.path.join('temp_pdf', entry)
77
- print(full_path)
78
- results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
79
- st.image(os.path.join(f'prediction//image_{index}',entry))
80
-
81
- st.markdown('### Extracted data from tables')
82
-
83
- with st.spinner('Performing OCR on tables to extract images...'):
84
- # Extract tables from the PDF
85
- tables = camelot.read_pdf(inputpath, pages='all', flavor='stream')
86
-
87
- # Print the tables or convert them to a different format like CSV
88
- for i, table in enumerate(tables):
89
- st.dataframe(table.df)
90
-
91
- st.success('Processing Completed!')
92
-
93
- # st.image(os.listdir('temp.pdf_dir'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # Run the app
96
  if __name__ == "__main__":
 
1
  import streamlit as st
 
2
  import os
 
3
  import shutil
4
  import fitz
5
  import pandas as pd
6
+ import easyocr
7
+ from openai import OpenAI
8
+ from dotenv import load_dotenv
9
+ import ast
10
+
11
+ load_dotenv()
12
+
13
@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV and return it as UTF-8 bytes.

    Cached by Streamlit so the conversion does not recompute on every rerun.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
17
+
18
def list_files(directory):
    """Yield the full path of every file under *directory*, recursively."""
    for dirpath, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            yield os.path.join(dirpath, filename)
22
+
23
def correct_list(client, list_str):
    """Ask the chat model to repair a syntactically invalid Python list literal.

    Parameters:
        client: OpenAI client used to issue the chat-completion request.
        list_str: prior model output that failed ``ast.literal_eval``
            (a list literal with broken syntax).

    Returns:
        The model's corrected list, still as a raw string — the caller is
        expected to run ``ast.literal_eval`` on it again; nothing here
        validates that the reply actually parses.
    """
    base_prompt = '''Above python list has syntax error.
    Correct the syntax without changing the values. Output should only be the corrected list.
    '''
    # The broken list is prepended directly before the instruction text
    # (no separator newline); the instruction refers to it as "Above".
    prompt = list_str + base_prompt

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    # First (only) choice carries the corrected literal as plain text.
    list_str_correct = chat_completion.choices[0].message.content

    return list_str_correct
41
 
42
  if os.path.exists('prediction') and os.path.isdir('prediction'):
43
  shutil.rmtree('prediction')
 
53
  else:
54
  print('found')
55
 
 
 
 
 
 
 
 
 
56
  temp_file_path = 'temp//temp.pdf'
57
 
58
+ reader = easyocr.Reader(['en'])
59
 
60
def main():
    """Streamlit app entry point.

    Flow: take an OpenAI credential and an uploaded transcript PDF, render up
    to 4 pages to PNG, OCR them with easyocr, ask GPT-3.5 to extract
    course/score pairs, and present the result as a downloadable CSV.
    """
    # Set the title of the app
    st.title("Transcript parser")

    credential = st.text_input('Credential')

    # BUGFIX: st.text_input returns an empty string (not None) until the
    # user types, so `is not None` was always true; test truthiness instead.
    if credential:

        # Create a file uploader to upload PDF files
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file is not None:
            # Persist the upload to disk so fitz can open it by path.
            with open(temp_file_path, 'wb') as f:
                f.write(uploaded_file.getbuffer())

            image_paths = []
            input_path = "temp//temp.pdf"
            with st.spinner('Performing OCR...'):
                doc = fitz.open(input_path)
                zoom = 4  # render at 4x resolution for better OCR quality
                mat = fitz.Matrix(zoom, zoom)
                # BUGFIX: the old page-counting loop reset count to 4 inside
                # the loop body, firing the error once per extra page. Use the
                # document's page count and warn exactly once.
                count = doc.page_count
                if count > 4:
                    count = 4
                    st.error('Page limit exceeded. processing first 4 images')
                context = ''
                for i in range(count):
                    st.markdown(f"Processing page {i+1}...")
                    val = os.path.join('temp_pdf', f"image_{i}.png")
                    page = doc.load_page(i)
                    pix = page.get_pixmap(matrix=mat)
                    pix.save(val)
                    image_paths.append(val)
                    # detail=0 makes easyocr return plain strings only.
                    text = reader.readtext(val, detail=0)
                    context = context + ' '.join(text)
                doc.close()

            print(context)

            st.success('OCR completed')

            with st.spinner('Parsing extracted text...'):
                st.markdown('### Extracted data from transcripts')
                # NOTE: a first draft of this prompt asked for letter grades;
                # only this points/scores version was ever used, so the dead
                # duplicate assignment was removed.
                base_prompt = '''Above is the OCR extracted transcript.
                Extract student's points/scores along with subject. Output should only be a lists of dict with course and points/scores as its keys.
                '''
                client = OpenAI(
                    api_key=credential,
                )

                prompt = context + base_prompt

                chat_completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                    model="gpt-3.5-turbo",
                )

                list_str = chat_completion.choices[0].message.content

                print(list_str)

                # literal_eval only accepts Python literals (safe on untrusted
                # model output) but raises on malformed syntax — in that case
                # ask the model to repair its own output once.
                # BUGFIX: narrowed the bare `except:` to the parse errors
                # literal_eval actually raises.
                try:
                    actual_list = ast.literal_eval(list_str)
                except (ValueError, SyntaxError):
                    list_str_correct = correct_list(client, list_str)
                    actual_list = ast.literal_eval(list_str_correct)

                # Robustness: an empty reply would crash on actual_list[0].
                if not actual_list:
                    st.error('No course data could be parsed from the transcript.')
                    return

                df = pd.DataFrame(columns=['Courses', 'Grade'])

                # The dict key names depend on the prompt wording, so read
                # them from the first entry instead of hard-coding them.
                keys_list = list(actual_list[0].keys())
                print(keys_list)

                for subject in actual_list:
                    df.loc[len(df)] = [subject[keys_list[0]], subject[keys_list[1]]]

                st.dataframe(df)

                csv = convert_df(df)

                st.download_button(
                    label="Download Parsed transcript",
                    data=csv,
                    file_name='transcript.csv',
                    mime='text/csv',
                )

            st.success('Transcript Processing Completed!')
179
 
180
  # Run the app
181
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,6 +1,5 @@
1
  streamlit==1.25.0
2
- ultralytics
3
  PyMuPDF
4
- gdown
5
- camelot-py[cv]
6
- PyPDF2<3.0
 
1
  streamlit==1.25.0
 
2
  PyMuPDF
3
+ easyocr
4
+ openai
5
+ python-dotenv