chiichann committed on
Commit
48f9fac
·
verified ·
1 Parent(s): 586280c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -21
app.py CHANGED
@@ -11,7 +11,13 @@ from difflib import SequenceMatcher
11
  import folium
12
  from streamlit_folium import st_folium
13
  import wikipediaapi
 
 
14
 
 
 
 
 
15
  wiki_wiki = wikipediaapi.Wikipedia(
16
  language='en',
17
  user_agent='AI-Historical-Doc-App/1.0 (contact: cherilynmarie.deocampo@wvsu.edu.com)'
@@ -51,14 +57,14 @@ def enhance_image(image):
51
 
52
  # Sharpening
53
  kernel = np.array([[0, -1, 0],
54
- [-1, 5,-1],
55
  [0, -1, 0]])
56
  sharpened = cv2.filter2D(denoised, -1, kernel)
57
 
58
  # Thresholding (binarization)
59
  _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
60
 
61
- # Optional: Resize (sometimes helps OCR)
62
  scale_percent = 150
63
  width = int(binary.shape[1] * scale_percent / 100)
64
  height = int(binary.shape[0] * scale_percent / 100)
@@ -66,7 +72,7 @@ def enhance_image(image):
66
 
67
  return resized
68
 
69
- # Function to perform OCR using EasyOCR
70
  def perform_ocr(image):
71
  if isinstance(image, np.ndarray):
72
  img_array = image
@@ -77,7 +83,7 @@ def perform_ocr(image):
77
  text = '\n'.join(results)
78
  return text
79
 
80
- # Function to extract named entities
81
  def extract_entities(text):
82
  entities = ner_pipeline(text)
83
  extracted = {}
@@ -86,18 +92,28 @@ def extract_entities(text):
86
  extracted.setdefault(label, set()).add(ent['word'])
87
  return extracted
88
 
 
 
 
 
 
89
  def get_historical_context(entities):
90
  context = {}
91
  for label, values in entities.items():
92
  for item in values:
93
- page = wiki_wiki.page(item)
94
- if page.exists():
95
- context[item] = page.summary[:500] # Limit summary length
96
- else:
97
- context[item] = f"No historical info found for '{item}'."
 
 
 
 
 
98
  return context
99
 
100
- # Function to correct OCR errors (suggestions)
101
  def suggest_corrections(original_text):
102
  words = original_text.split()
103
  suggestions = {}
@@ -109,23 +125,23 @@ def suggest_corrections(original_text):
109
  suggestions[word] = close_matches[0]
110
  return suggestions
111
 
112
- # Function to generate map
113
  def generate_map(entities):
114
  m = folium.Map(location=[20, 0], zoom_start=2)
115
  if "LOC" in entities:
116
  for location in entities["LOC"]:
117
- # Dummy coordinates for demonstration
118
  folium.Marker(
119
- location=[51.5074, -0.1278], # Example: London
120
  popup=f"Location: {location}",
121
  tooltip=location
122
  ).add_to(m)
123
  return m
124
 
 
125
  if uploaded_file:
126
  file_type = uploaded_file.type
127
 
128
- # Display and process the uploaded document
129
  if file_type == "application/pdf":
130
  images = pdf_to_images(uploaded_file.read())
131
  else:
@@ -134,42 +150,35 @@ if uploaded_file:
134
  for image in images:
135
  st.image(image, caption="Uploaded Document", use_container_width=True)
136
 
137
- # Enhance image
138
  enhanced = enhance_image(image)
139
  st.image(enhanced, caption="Enhanced Image", use_container_width=True, channels="GRAY")
140
 
141
- # Perform OCR
142
  ocr_text = perform_ocr(enhanced)
143
  st.subheader("Extracted Text (OCR)")
144
  st.text_area("Text", ocr_text, height=200)
145
 
146
- # Suggest corrections
147
  corrections = suggest_corrections(ocr_text)
148
  if corrections:
149
  st.subheader("AI Suggestions for Possible Corrections")
150
  for original, suggestion in corrections.items():
151
  st.markdown(f"**{original}** ➔ *{suggestion}*")
152
 
153
- # Summarize text
154
  if len(ocr_text.strip()) > 50:
155
  summary = summarizer(ocr_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
156
  st.subheader("Summary")
157
  st.write(summary)
158
 
159
- # Extract entities
160
  entities = extract_entities(ocr_text)
161
  st.subheader("Key Information")
162
  for label, items in entities.items():
163
  st.markdown(f"**{label}**: {', '.join(items)}")
164
 
165
- # Provide historical context
166
  context = get_historical_context(entities)
167
  if context:
168
  st.subheader("Historical Context & Insights")
169
  for item, info in context.items():
170
  st.markdown(f"- **{item}**: {info}")
171
 
172
- # Visualize map
173
  st.subheader("Locations Mentioned")
174
  map_ = generate_map(entities)
175
  st_folium(map_, width=700)
 
11
  import folium
12
  from streamlit_folium import st_folium
13
  import wikipediaapi
14
+ import logging
15
+ import re
16
 
17
+ # Configure logging
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
+ # Wikipedia API setup
21
  wiki_wiki = wikipediaapi.Wikipedia(
22
  language='en',
23
  user_agent='AI-Historical-Doc-App/1.0 (contact: cherilynmarie.deocampo@wvsu.edu.com)'
 
57
 
58
  # Sharpening
59
  kernel = np.array([[0, -1, 0],
60
+ [-1, 5, -1],
61
  [0, -1, 0]])
62
  sharpened = cv2.filter2D(denoised, -1, kernel)
63
 
64
  # Thresholding (binarization)
65
  _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
66
 
67
+ # Resize
68
  scale_percent = 150
69
  width = int(binary.shape[1] * scale_percent / 100)
70
  height = int(binary.shape[0] * scale_percent / 100)
 
72
 
73
  return resized
74
 
75
+ # OCR
76
  def perform_ocr(image):
77
  if isinstance(image, np.ndarray):
78
  img_array = image
 
83
  text = '\n'.join(results)
84
  return text
85
 
86
+ # Extract named entities
87
  def extract_entities(text):
88
  entities = ner_pipeline(text)
89
  extracted = {}
 
92
  extracted.setdefault(label, set()).add(ent['word'])
93
  return extracted
94
 
95
+ # Clean extracted entities for Wikipedia
96
+ def clean_entity(text):
97
+ return re.sub(r"[^\w\s]", "", text).strip()
98
+
99
+ # Historical context fetcher
100
  def get_historical_context(entities):
101
  context = {}
102
  for label, values in entities.items():
103
  for item in values:
104
+ cleaned_item = clean_entity(item)
105
+ try:
106
+ page = wiki_wiki.page(cleaned_item)
107
+ if page.exists():
108
+ context[item] = page.summary[:500] # Limit summary
109
+ else:
110
+ context[item] = f"No historical info found for '{item}'."
111
+ except Exception as e:
112
+ logging.warning(f"Wikipedia lookup failed for '{item}': {e}")
113
+ context[item] = f"Error fetching data for '{item}': {e}"
114
  return context
115
 
116
+ # Suggest corrections
117
  def suggest_corrections(original_text):
118
  words = original_text.split()
119
  suggestions = {}
 
125
  suggestions[word] = close_matches[0]
126
  return suggestions
127
 
128
+ # Generate map
129
  def generate_map(entities):
130
  m = folium.Map(location=[20, 0], zoom_start=2)
131
  if "LOC" in entities:
132
  for location in entities["LOC"]:
133
+ # Dummy coordinates
134
  folium.Marker(
135
+ location=[51.5074, -0.1278],
136
  popup=f"Location: {location}",
137
  tooltip=location
138
  ).add_to(m)
139
  return m
140
 
141
+ # Main process
142
  if uploaded_file:
143
  file_type = uploaded_file.type
144
 
 
145
  if file_type == "application/pdf":
146
  images = pdf_to_images(uploaded_file.read())
147
  else:
 
150
  for image in images:
151
  st.image(image, caption="Uploaded Document", use_container_width=True)
152
 
 
153
  enhanced = enhance_image(image)
154
  st.image(enhanced, caption="Enhanced Image", use_container_width=True, channels="GRAY")
155
 
 
156
  ocr_text = perform_ocr(enhanced)
157
  st.subheader("Extracted Text (OCR)")
158
  st.text_area("Text", ocr_text, height=200)
159
 
 
160
  corrections = suggest_corrections(ocr_text)
161
  if corrections:
162
  st.subheader("AI Suggestions for Possible Corrections")
163
  for original, suggestion in corrections.items():
164
  st.markdown(f"**{original}** ➔ *{suggestion}*")
165
 
 
166
  if len(ocr_text.strip()) > 50:
167
  summary = summarizer(ocr_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
168
  st.subheader("Summary")
169
  st.write(summary)
170
 
 
171
  entities = extract_entities(ocr_text)
172
  st.subheader("Key Information")
173
  for label, items in entities.items():
174
  st.markdown(f"**{label}**: {', '.join(items)}")
175
 
 
176
  context = get_historical_context(entities)
177
  if context:
178
  st.subheader("Historical Context & Insights")
179
  for item, info in context.items():
180
  st.markdown(f"- **{item}**: {info}")
181
 
 
182
  st.subheader("Locations Mentioned")
183
  map_ = generate_map(entities)
184
  st_folium(map_, width=700)