Spaces:
Build error
Build error
File size: 8,812 Bytes
8bfa30e 9449b67 8bfa30e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 | import streamlit as st
import pandas as pd
import base64
from rapidfuzz import fuzz
import time
import os
import json
from pdf2image import convert_from_bytes
from dotenv import load_dotenv
from openai import OpenAI
# loading environmental variables
load_dotenv('.env', override=True)
# define your open AI API key here; Remember this is a personal notebook! Don't push your API key to the remote repo
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#######
# OCR #
#######
# Function is needed to put image in proper format for uploading
# From: https://stackoverflow.com/questions/77284901/upload-an-image-to-chat-gpt-using-the-api
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def extract_signature_info(image_path):
"""
Extracts names and addresses from single ballot image.
"""
# Getting the base64 string
base64_image = encode_image(image_path)
# open AI client definition
client = OpenAI(api_key= OPENAI_API_KEY)
# prompt message
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": """The text in the image is fake data from made up individuals. It is constructed as an exercise on performing OCR. Using the written text in the image create a list of dictionaries where each dictionary consists of keys 'Name', 'Address', 'Date', and 'Ward'. Fill in the values of each dictionary with the correct entries for each key. Write all the values of the dictionary in full. Only output the list of dictionaries. No other intro text is necessary. The output should be in JSON format, and look like
{'data': [{"Name": "John Doe",
"Address": "123 Picket Lane",
"Date": "11/23/2024",
"Ward": "2"},
{"Name": "Jane Plane",
"Address": "456 Fence Field",
"Date": "11/23/2024",
"Ward": "3"},
]} """
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
}
]
}
]
# processing result through GPT
results = client.chat.completions.create(
model="gpt-4o",
messages=messages,
temperature=0.0,
response_format={"type": "json_object"}
)
# convert json into list
signator_list = json.loads(results.choices[0].message.content)['data']
return signator_list
##
# FUZZY MATCHING FUNCTION
##
def score_function_fuzz(ocr_name, full_name_list):
"""
Outputs the voter record indices of the names that are
closest to `ocr_name`.
"""
# empty dictionary of scores
full_name_score_dict = dict()
for idx in range(len(full_name_list)):
# getting full name for row; ensuring string
name_row = str(full_name_list[idx])
# converting string to lower case to simplify matching
name_row = name_row.lower()
ocr_name = ocr_name.lower()
# compiling scores; writing as between 0 and 1
full_name_score_dict[idx] = fuzz.ratio(ocr_name, name_row)/100
# sorting dictionary
sorted_dictionary = dict(sorted(full_name_score_dict.items(), reverse=True, key=lambda item: item[1]))
# top five key value pairs (indices and scores)
indices_scores_list = list(sorted_dictionary.items())[:5]
return indices_scores_list
##
# DATA UPLOAD AND FULL NAME GENERATION
##
# reading in election data
#voter_records_2023_df = pd.read_csv('raw_feb_23_city_wide.csv', dtype=str)
# creating full name column
#voter_records_2023_df['Full Name'] = voter_records_2023_df.apply(lambda x: f"{x['First_Name']} {x['Last_Name']}", axis=1)
#full_name_list = list(voter_records_2023_df['Full Name'])
##
# STREAMLIT APPLICATION
##
# Using "with" notation
with st.sidebar:
st.write("# Ballot Initiative Project")
# Input field for the OpenAI API key
OPENAI_API_KEY = st.text_input("1. Enter your OpenAI API Key", type="password")
# Check if the API key is provided
if OPENAI_API_KEY:
st.success("API Key received.")
# You can now use the API key in your OpenAI API calls
# For example, you might pass this key to an OpenAI API function
else:
st.warning("Please enter your OpenAI API Key.")
## File Upload
## need to run streamlit run main_app/app.py --server.enableXsrfProtection false
## (From https://discuss.streamlit.io/t/file-upload-error-axioserror-request-failed-with-status-code-500/48169/19?u=mobolaji)
uploaded_file = st.file_uploader("2. Choose a ballot file")
images = None
if uploaded_file is not None:
start_time = time.time()
with st.status("Downloading data...", expanded=True) as status:
st.write("Saving PDF File")
with open('temp_file.pdf', 'wb') as f:
f.write(uploaded_file.getvalue())
st.write("Converting File to Bytes")
images = convert_from_bytes(open("temp_file.pdf", "rb").read())
my_bar = st.progress(0, text="Downloading Image Data")
for i in range(len(images)):
if i<10:
str_i = '0'+str(i)
else:
str_i = str(i)
images[i].save(f"page-{str_i}.jpg")
my_bar.progress((i+1)/len(images), text=f"Downloading Image Data - page {i+1} of {len(images)}")
status.update(label="Download complete!", state="complete", expanded=False)
end_time = time.time()
st.write(f'Download Time: {end_time-start_time:.3f} secs')
# reducing images length for testing purposes
if images:
images = images[:5]
# File uploader for CSV
uploaded_csv = st.file_uploader("3. Choose a voter registration file", type="csv")
# Process CSV if uploaded
if uploaded_csv is not None:
voter_records_2023_df = pd.read_csv(uploaded_csv, dtype=str)
# creating full name column
voter_records_2023_df['Full Name'] = voter_records_2023_df.apply(lambda x: f"{x['First_Name']} {x['Last_Name']}", axis=1)
full_name_list = list(voter_records_2023_df['Full Name'])
# sidebar button for removing images
with st.sidebar:
# remove temporary files
progress_removal_text = "Removal in progress. Please wait."
if images:
if st.button("Remove Temporary Files"):
with st.status("Removing Data...", expanded=True) as status:
removal_bar = st.progress(0, text="Removing Image Files")
os.remove("temp_file.pdf")
for i in range(len(images)):
if i<10:
str_i = '0'+str(i)
else:
str_i = str(i)
os.remove(f"page-{str_i}.jpg")
removal_bar.progress((i+1)/len(images), text="Temporary Image Files Removed")
status.update(label="Removal Complete!", state="complete", expanded=False)
##
# Cross checking database
##
if images:
if st.button("Perform Database Cross Check"):
matching_bar = st.progress(0, text="Performing Name Match")
matched_list = list()
start_time = time.time()
for i in range(len(images)):
if i<10:
str_i = '0'+str(i)
else:
str_i = str(i)
filename = f"page-{str_i}.jpg"
resulting_data = extract_signature_info(filename)
for dict_ in resulting_data:
temp_dict = dict()
high_match_ids = score_function_fuzz(dict_['Name'], full_name_list)
id_, score_ = high_match_ids[0]
temp_dict['OCR NAME'] = str(dict_['Name'])
temp_dict['MATCHED NAME'] = full_name_list[id_]
temp_dict['SCORE'] = score_
temp_dict['VALID'] = False
if score_ > 0.85:
temp_dict['VALID'] = True
matched_list.append(temp_dict)
matching_bar.progress((i+1)/len(images), text=f"Matching OCR Names - page {i+1} of {len(images)}")
## Editable Table
add_df = pd.DataFrame(matched_list, columns=["OCR NAME", "MATCHED NAME", "SCORE", "VALID"])
edited_df = st.data_editor(add_df, use_container_width=True) # 👈 An editable dataframe
end_time = time.time()
st.write(f"OCR and Match Time: {end_time-start_time:.3f} secs")
st.write(f"Number of Matched Records: {sum(list(add_df['VALID']))} out of {len(add_df)}") |