darthPanda committed on
Commit
ce86dab
·
1 Parent(s): 53e11fa

transcript parser

Browse files
Files changed (2) hide show
  1. app.py +153 -68
  2. requirements.txt +3 -4
app.py CHANGED
@@ -1,13 +1,43 @@
1
  import streamlit as st
2
- import shutil
3
  import os
4
- from ultralytics import YOLO
5
  import shutil
6
  import fitz
7
  import pandas as pd
8
- import gdown
9
- import camelot
10
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  if os.path.exists('prediction') and os.path.isdir('prediction'):
13
  shutil.rmtree('prediction')
@@ -23,74 +53,129 @@ if not os.path.exists('temp_pdf'):
23
  else:
24
  print('found')
25
 
26
- # Check if the directory exists
27
- if not os.path.exists('model'):
28
- # If it does not exist, create it
29
- os.makedirs('model')
30
- url = "https://drive.google.com/uc?id=1zv3VDW-LXuesKLrTm6xSdKGrycutFdHb"
31
- output = "model//best.pt"
32
- gdown.download(url, output, quiet=False)
33
-
34
  temp_file_path = 'temp//temp.pdf'
35
 
36
- model = YOLO('model//best.pt')
37
 
38
  def main():
39
  # Set the title of the app
40
- st.title("Table detection")
41
-
42
- # Create a file uploader to upload PDF files
43
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
44
-
45
- if uploaded_file is not None:
46
- # Create a temporary directory
47
-
48
- with open(temp_file_path, 'wb') as f:
49
- f.write(uploaded_file.getbuffer())
50
-
51
-
52
- inputpath = "temp//temp.pdf"
53
- st.markdown('### Images of detected tables')
54
- with st.spinner('Converting pdf to images...'):
55
- doc = fitz.open(inputpath)
56
- zoom = 4
57
- mat = fitz.Matrix(zoom, zoom)
58
- count = 0
59
- for p in doc:
60
- count += 1
61
- for i in range(count):
62
- val = f"image_{i}.png"
63
- val = os.path.join('temp_pdf',val)
64
- page = doc.load_page(i)
65
- pix = page.get_pixmap(matrix=mat)
66
- pix.save(val)
67
- doc.close()
68
-
69
-
70
- with st.spinner('Detecting table in images...'):
71
-
72
- # for index, entry in enumerate(os.listdir('./temp.pdf_dir')):
73
- for index, entry in enumerate(os.listdir('./temp_pdf')):
74
- print(entry)
75
- # Construct the full file path
76
- full_path = os.path.join('temp_pdf', entry)
77
- print(full_path)
78
- results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
79
- st.image(os.path.join(f'prediction//image_{index}',entry))
80
-
81
- st.markdown('### Extracted data from tables')
82
-
83
- with st.spinner('Performing OCR on tables to extract images...'):
84
- # Extract tables from the PDF
85
- tables = camelot.read_pdf(inputpath, pages='all', flavor='stream')
86
-
87
- # Print the tables or convert them to a different format like CSV
88
- for i, table in enumerate(tables):
89
- st.dataframe(table.df)
90
-
91
- st.success('Processing Completed!')
92
-
93
- # st.image(os.listdir('temp.pdf_dir'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # Run the app
96
  if __name__ == "__main__":
 
1
  import streamlit as st
 
2
  import os
 
3
  import shutil
4
  import fitz
5
  import pandas as pd
6
+ import easyocr
7
+ from openai import OpenAI
8
+ from dotenv import load_dotenv
9
+ import ast
10
+
11
+ load_dotenv()
12
+
13
@st.cache_data
def convert_df(df):
    """Serialize *df* to CSV and return it as UTF-8 bytes.

    Cached by Streamlit so the conversion does not recompute on every rerun.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
17
+
18
def list_files(directory):
    """Yield the full path of every file under *directory*, recursively."""
    for dirpath, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            yield os.path.join(dirpath, filename)
22
+
23
def correct_list(client, list_str):
    """Ask the chat model to repair a syntactically invalid Python list literal.

    Parameters:
        client: OpenAI client used to issue the chat-completion request.
        list_str: prior model output that failed ``ast.literal_eval``
            (a list literal with broken syntax).

    Returns:
        The model's corrected list, still as a raw string — the caller is
        expected to run ``ast.literal_eval`` on it again; nothing here
        validates that the reply actually parses.
    """
    base_prompt = '''Above python list has syntax error.
    Correct the syntax without changing the values. Output should only be the corrected list.
    '''
    # The broken list is prepended directly before the instruction text
    # (no separator newline); the instruction refers to it as "Above".
    prompt = list_str + base_prompt

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    # First (only) choice carries the corrected literal as plain text.
    list_str_correct = chat_completion.choices[0].message.content

    return list_str_correct
41
 
42
  if os.path.exists('prediction') and os.path.isdir('prediction'):
43
  shutil.rmtree('prediction')
 
53
  else:
54
  print('found')
55
 
 
 
 
 
 
 
 
 
56
  temp_file_path = 'temp//temp.pdf'
57
 
58
+ reader = easyocr.Reader(['en'])
59
 
60
def main():
    """Streamlit app entry point.

    Flow: take an OpenAI credential and an uploaded transcript PDF, render up
    to 4 pages to PNG, OCR them with easyocr, ask GPT-3.5 to extract
    course/score pairs, and present the result as a downloadable CSV.
    """
    # Set the title of the app
    st.title("Transcript parser")

    credential = st.text_input('Credential')

    # BUGFIX: st.text_input returns an empty string (not None) until the
    # user types, so `is not None` was always true; test truthiness instead.
    if credential:

        # Create a file uploader to upload PDF files
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file is not None:
            # Persist the upload to disk so fitz can open it by path.
            with open(temp_file_path, 'wb') as f:
                f.write(uploaded_file.getbuffer())

            image_paths = []
            input_path = "temp//temp.pdf"
            with st.spinner('Performing OCR...'):
                doc = fitz.open(input_path)
                zoom = 4  # render at 4x resolution for better OCR quality
                mat = fitz.Matrix(zoom, zoom)
                # BUGFIX: the old page-counting loop reset count to 4 inside
                # the loop body, firing the error once per extra page. Use the
                # document's page count and warn exactly once.
                count = doc.page_count
                if count > 4:
                    count = 4
                    st.error('Page limit exceeded. processing first 4 images')
                context = ''
                for i in range(count):
                    st.markdown(f"Processing page {i+1}...")
                    val = os.path.join('temp_pdf', f"image_{i}.png")
                    page = doc.load_page(i)
                    pix = page.get_pixmap(matrix=mat)
                    pix.save(val)
                    image_paths.append(val)
                    # detail=0 makes easyocr return plain strings only.
                    text = reader.readtext(val, detail=0)
                    context = context + ' '.join(text)
                doc.close()

            print(context)

            st.success('OCR completed')

            with st.spinner('Parsing extracted text...'):
                st.markdown('### Extracted data from transcripts')
                # NOTE: a first draft of this prompt asked for letter grades;
                # only this points/scores version was ever used, so the dead
                # duplicate assignment was removed.
                base_prompt = '''Above is the OCR extracted transcript.
                Extract student's points/scores along with subject. Output should only be a lists of dict with course and points/scores as its keys.
                '''
                client = OpenAI(
                    api_key=credential,
                )

                prompt = context + base_prompt

                chat_completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                    model="gpt-3.5-turbo",
                )

                list_str = chat_completion.choices[0].message.content

                print(list_str)

                # literal_eval only accepts Python literals (safe on untrusted
                # model output) but raises on malformed syntax — in that case
                # ask the model to repair its own output once.
                # BUGFIX: narrowed the bare `except:` to the parse errors
                # literal_eval actually raises.
                try:
                    actual_list = ast.literal_eval(list_str)
                except (ValueError, SyntaxError):
                    list_str_correct = correct_list(client, list_str)
                    actual_list = ast.literal_eval(list_str_correct)

                # Robustness: an empty reply would crash on actual_list[0].
                if not actual_list:
                    st.error('No course data could be parsed from the transcript.')
                    return

                df = pd.DataFrame(columns=['Courses', 'Grade'])

                # The dict key names depend on the prompt wording, so read
                # them from the first entry instead of hard-coding them.
                keys_list = list(actual_list[0].keys())
                print(keys_list)

                for subject in actual_list:
                    df.loc[len(df)] = [subject[keys_list[0]], subject[keys_list[1]]]

                st.dataframe(df)

                csv = convert_df(df)

                st.download_button(
                    label="Download Parsed transcript",
                    data=csv,
                    file_name='transcript.csv',
                    mime='text/csv',
                )

            st.success('Transcript Processing Completed!')
179
 
180
  # Run the app
181
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,6 +1,5 @@
1
  streamlit==1.25.0
2
- ultralytics
3
  PyMuPDF
4
- gdown
5
- camelot-py[cv]
6
- PyPDF2<3.0
 
1
  streamlit==1.25.0
 
2
  PyMuPDF
3
+ easyocr
4
+ openai
5
+ python-dotenv