chiichann committed on
Commit
db40c43
·
verified ·
1 Parent(s): 5155ce2

Upload 2 files

Browse files
Files changed (2) hide show
  1. apps.py +173 -0
  2. requirements.txt +10 -0
apps.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PIL import Image
3
+ import pytesseract
4
+ import io
5
+ import fitz # PyMuPDF
6
+ import cv2
7
+ import numpy as np
8
+ import requests
9
+ from transformers import pipeline
10
+ from difflib import SequenceMatcher
11
+ import folium
12
+ from streamlit_folium import st_folium
13
+ import wikipedia
14
+
15
# Streamlit App
# set_page_config is issued before anything else that may render UI (the
# cached loader below can show a spinner while the models are built).
st.set_page_config(page_title="AI Historical Document Decipher", layout="wide")


@st.cache_resource
def _load_pipelines():
    """Build the summarization and NER pipelines once and reuse them.

    Streamlit re-executes this script on every user interaction; without
    caching, both models would be re-instantiated on each rerun.

    Returns:
        A ``(summarizer, ner_pipeline)`` tuple of Hugging Face pipelines.
    """
    summarization = pipeline("summarization", model="facebook/bart-large-cnn")
    ner = pipeline("ner", aggregation_strategy="simple")
    return summarization, ner


# Load summarization and NER pipeline
summarizer, ner_pipeline = _load_pipelines()

st.title("📜 AI-powered Historical Document Deciphering App")

st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader(
    "Upload Image or PDF", type=["jpg", "jpeg", "png", "pdf"]
)
25
+
26
+ # Function to convert PDF to image
27
def pdf_to_images(pdf_bytes, dpi=None):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Optional render resolution forwarded to ``Page.get_pixmap``.
            ``None`` keeps PyMuPDF's default resolution (the original
            behavior); a higher value yields larger page images.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per page, in page order.
    """
    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            img = Image.open(io.BytesIO(pix.tobytes()))
            # Decode now so the image no longer depends on lazy reads.
            img.load()
            images.append(img)
    return images
35
+
36
+ # Function to enhance image
37
def enhance_image(image, scale_percent=150):
    """Pre-process a document image for OCR.

    The pipeline is: grayscale -> non-local-means denoise -> sharpen ->
    Otsu binarization -> upscale.

    Args:
        image: A PIL image (any mode; it is converted to RGB first).
        scale_percent: Size of the output relative to the input, in percent.
            Defaults to 150, the value the function previously hard-coded.

    Returns:
        A 2-D ``numpy`` array holding the binarized, resized image.
    """
    rgb = np.array(image.convert('RGB'))
    # PIL delivers channels in RGB order, so the matching conversion code is
    # RGB2GRAY; BGR2GRAY would apply the red and blue weights to the wrong
    # channels.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(gray, h=30)

    # Sharpening
    kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    # Thresholding (binarization)
    _, binary = cv2.threshold(
        sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # Resize (sometimes helps OCR); never let a dimension collapse to zero.
    width = max(1, int(binary.shape[1] * scale_percent / 100))
    height = max(1, int(binary.shape[0] * scale_percent / 100))
    return cv2.resize(binary, (width, height), interpolation=cv2.INTER_CUBIC)
60
+
61
+ # Function to perform OCR
62
def perform_ocr(image):
    """Run Tesseract on *image* and return the recognized text.

    Args:
        image: Image accepted by ``pytesseract.image_to_string``.

    Returns:
        The text string produced by Tesseract.
    """
    # Engine mode 3, page segmentation mode 6, and inter-word spacing kept.
    tesseract_flags = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    return pytesseract.image_to_string(image, config=tesseract_flags)
66
+
67
+ # Function to extract named entities
68
def extract_entities(text):
    """Group the named entities found in *text* by their entity label.

    Args:
        text: The text passed to the module-level ``ner_pipeline``.

    Returns:
        A dict mapping each ``entity_group`` label to the set of ``word``
        strings reported under that label.
    """
    grouped = {}
    for entity in ner_pipeline(text):
        group = entity['entity_group']
        if group not in grouped:
            grouped[group] = set()
        grouped[group].add(entity['word'])
    return grouped
75
+
76
def get_historical_context(entities):
    """Look up a short Wikipedia summary for every extracted entity.

    Args:
        entities: Mapping of entity label to an iterable of entity strings,
            as produced by ``extract_entities``.

    Returns:
        A dict mapping each entity string to either a two-sentence summary
        or a human-readable message describing why no summary is available.
    """
    context = {}
    # Only the entity strings matter here; the labels are not used.
    for values in entities.values():
        for item in values:
            # An entity may appear under several labels; query it only once.
            if item in context:
                continue
            try:
                context[item] = wikipedia.summary(item, sentences=2)
            except wikipedia.exceptions.DisambiguationError as e:
                context[item] = f"Multiple entries found for '{item}': {e.options[:3]}"
            except wikipedia.exceptions.PageError:
                context[item] = f"No historical info found for '{item}'."
            except Exception as e:
                # Deliberate best-effort: a failed lookup (e.g. network
                # error) is reported inline instead of aborting the page.
                context[item] = f"Error retrieving info: {e}"
    return context
90
+
91
+ # Function to correct OCR errors (suggestions)
92
# Reference words that OCR output is compared against.
_OCR_VOCABULARY = ("document", "historical", "archive", "event", "location")
# Characters stripped from both ends of a token before matching.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"


def suggest_corrections(original_text, vocabulary=_OCR_VOCABULARY, threshold=0.75):
    """Suggest replacements for words that look like misread vocabulary words.

    Each whitespace-separated token is stripped of surrounding punctuation
    and compared (case-insensitively) with every vocabulary word using
    ``difflib.SequenceMatcher``.

    Args:
        original_text: The OCR text to inspect.
        vocabulary: Lower-case candidate words to match against.
        threshold: A candidate is accepted only if its similarity ratio is
            strictly greater than this value.

    Returns:
        A dict mapping each suspicious word (punctuation removed) to the
        most similar vocabulary word. Words of four characters or fewer,
        purely numeric words, and words that already equal a vocabulary
        word are never reported.
    """
    suggestions = {}
    for token in original_text.split():
        word = token.strip(_EDGE_PUNCTUATION)
        if len(word) <= 4 or word.isnumeric():
            continue
        lowered = word.lower()
        # A word that is already spelled correctly needs no correction.
        if lowered in vocabulary:
            continue
        best_ratio, best_match = threshold, None
        for candidate in vocabulary:
            ratio = SequenceMatcher(None, lowered, candidate).ratio()
            # Strict comparison keeps the earliest candidate on ties.
            if ratio > best_ratio:
                best_ratio, best_match = ratio, candidate
        if best_match is not None:
            suggestions[word] = best_match
    return suggestions
101
+
102
+ # Function to generate map
103
def generate_map(entities):
    """Build a world map with one marker per entity labelled ``"LOC"``.

    Args:
        entities: Mapping of entity label to entity strings.

    Returns:
        A ``folium.Map``; it carries no markers when there is no ``"LOC"``
        entry.
    """
    world_map = folium.Map(location=[20, 0], zoom_start=2)
    for location in entities.get("LOC", ()):
        # Dummy coordinates for demonstration: every marker is placed at the
        # same fixed point (example: London), not at the named location.
        marker = folium.Marker(
            location=[51.5074, -0.1278],
            popup=f"Location: {location}",
            tooltip=location,
        )
        marker.add_to(world_map)
    return world_map
114
+
115
if uploaded_file:
    file_type = uploaded_file.type

    # Display and process the uploaded document: a PDF yields one image per
    # page, any other upload is treated as a single image.
    if file_type == "application/pdf":
        images = pdf_to_images(uploaded_file.read())
    else:
        images = [Image.open(uploaded_file)]

    # page_number gives each page's widgets a unique key, so pages with
    # identical content do not collide on widget identity.
    for page_number, image in enumerate(images, start=1):
        st.image(image, caption="Uploaded Document", use_container_width=True)

        # Enhance image
        enhanced = enhance_image(image)
        st.image(enhanced, caption="Enhanced Image", use_container_width=True, channels="GRAY")

        # Perform OCR
        ocr_text = perform_ocr(enhanced)
        st.subheader("Extracted Text (OCR)")
        st.text_area("Text", ocr_text, height=200, key=f"ocr_text_{page_number}")

        # Suggest corrections
        corrections = suggest_corrections(ocr_text)
        if corrections:
            st.subheader("AI Suggestions for Possible Corrections")
            for original, suggestion in corrections.items():
                st.markdown(f"**{original}** ➔ *{suggestion}*")

        # Summarize text (skipped for very short OCR output)
        if len(ocr_text.strip()) > 50:
            # truncation=True keeps long pages within the model's input limit
            # instead of failing on over-long text.
            summary = summarizer(
                ocr_text,
                max_length=60,
                min_length=20,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            st.subheader("Summary")
            st.write(summary)

        # Extract entities
        entities = extract_entities(ocr_text)
        st.subheader("Key Information")
        for label, items in entities.items():
            st.markdown(f"**{label}**: {', '.join(items)}")

        # Provide historical context
        context = get_historical_context(entities)
        if context:
            st.subheader("Historical Context & Insights")
            for item, info in context.items():
                st.markdown(f"- **{item}**: {info}")

        # Visualize map
        st.subheader("Locations Mentioned")
        map_ = generate_map(entities)
        st_folium(map_, width=700, key=f"locations_map_{page_number}")

        st.markdown("---")

else:
    st.info("Upload an image or PDF of a historical document to begin.")

st.sidebar.markdown("---")
st.sidebar.markdown("Developed by **Cherilyn Marie Deocampo**")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.41.1
2
+ Pillow==10.2.0
3
+ easyocr==1.7.1
4
+ PyMuPDF==1.25.5
5
+ opencv-python==4.11.0.86
6
+ numpy==1.26.4
7
+ transformers==4.49.0
8
+ folium==0.19.5
9
+ streamlit-folium==0.24.0
10
+ wikipedia==1.4.0