chiichann committed on
Commit
854b9f6
·
verified ·
1 Parent(s): 1046682

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -14
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
  from PIL import Image
3
- import pytesseract
4
  import io
5
  import fitz # PyMuPDF
6
  import cv2
@@ -10,14 +10,15 @@ from transformers import pipeline
10
  from difflib import SequenceMatcher
11
  import folium
12
  from streamlit_folium import st_folium
13
- import wikipediaapi
14
-
15
- pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
16
 
17
  # Load summarization and NER pipeline
18
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
19
  ner_pipeline = pipeline("ner", aggregation_strategy="simple")
20
 
 
 
 
21
  # Streamlit App
22
  st.set_page_config(page_title="AI Historical Document Decipher", layout="wide")
23
  st.title("📜 AI-powered Historical Document Deciphering App")
@@ -53,17 +54,22 @@ def enhance_image(image):
53
  _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
54
 
55
  # Optional: Resize (sometimes helps OCR)
56
- scale_percent = 150 # percent of original size
57
  width = int(binary.shape[1] * scale_percent / 100)
58
  height = int(binary.shape[0] * scale_percent / 100)
59
  resized = cv2.resize(binary, (width, height), interpolation=cv2.INTER_CUBIC)
60
 
61
  return resized
62
 
63
- # Function to perform OCR
64
  def perform_ocr(image):
65
- custom_oem_psm_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
66
- text = pytesseract.image_to_string(image, config=custom_oem_psm_config)
 
 
 
 
 
67
  return text
68
 
69
  # Function to extract named entities
@@ -76,15 +82,18 @@ def extract_entities(text):
76
  return extracted
77
 
78
  def get_historical_context(entities):
79
- wiki_wiki = wikipediaapi.Wikipedia('en')
80
  context = {}
81
  for label, values in entities.items():
82
  for item in values:
83
- page = wiki_wiki.page(item)
84
- if page.exists():
85
- context[item] = page.summary[0:500] # get first 500 characters
86
- else:
 
 
87
  context[item] = f"No historical info found for '{item}'."
 
 
88
  return context
89
 
90
  # Function to correct OCR errors (suggestions)
@@ -93,7 +102,8 @@ def suggest_corrections(original_text):
93
  suggestions = {}
94
  for word in words:
95
  if len(word) > 4 and not word.isnumeric():
96
- close_matches = [w for w in ["document", "historical", "archive", "event", "location"] if SequenceMatcher(None, word.lower(), w).ratio() > 0.75]
 
97
  if close_matches:
98
  suggestions[word] = close_matches[0]
99
  return suggestions
 
1
  import streamlit as st
2
  from PIL import Image
3
+ import easyocr
4
  import io
5
  import fitz # PyMuPDF
6
  import cv2
 
10
  from difflib import SequenceMatcher
11
  import folium
12
  from streamlit_folium import st_folium
13
+ import wikipedia
 
 
14
 
15
  # Load summarization and NER pipeline
16
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
17
  ner_pipeline = pipeline("ner", aggregation_strategy="simple")
18
 
19
+ # Initialize EasyOCR reader
20
+ reader = easyocr.Reader(['en'], gpu=False)
21
+
22
  # Streamlit App
23
  st.set_page_config(page_title="AI Historical Document Decipher", layout="wide")
24
  st.title("📜 AI-powered Historical Document Deciphering App")
 
54
  _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
55
 
56
  # Optional: Resize (sometimes helps OCR)
57
+ scale_percent = 150
58
  width = int(binary.shape[1] * scale_percent / 100)
59
  height = int(binary.shape[0] * scale_percent / 100)
60
  resized = cv2.resize(binary, (width, height), interpolation=cv2.INTER_CUBIC)
61
 
62
  return resized
63
 
64
# Function to perform OCR using EasyOCR
def perform_ocr(image):
    """Run the module-level EasyOCR ``reader`` on *image* and return its text.

    A NumPy array is handed to the reader unchanged; any other input is
    first converted with ``image.convert('RGB')`` and wrapped in an array.
    The strings produced by ``reader.readtext(..., detail=0)`` are joined
    with newline characters into a single string.
    """
    pixels = image if isinstance(image, np.ndarray) else np.array(image.convert('RGB'))
    return '\n'.join(reader.readtext(pixels, detail=0))
74
 
75
  # Function to extract named entities
 
82
  return extracted
83
 
def get_historical_context(entities):
    """Collect a short Wikipedia blurb for every extracted entity.

    Args:
        entities: Mapping of entity label to an iterable of entity strings.

    Returns:
        A dict keyed by entity string. Each value is either the
        two-sentence summary returned by ``wikipedia.summary`` or a
        human-readable fallback message (ambiguous title, missing page,
        or any other lookup error).
    """
    context = {}
    for values in entities.values():
        for item in values:
            # The same entity can show up under several labels or more than
            # once in a list; look it up only the first time it is seen
            # instead of repeating the Wikipedia query.
            if item in context:
                continue
            try:
                context[item] = wikipedia.summary(item, sentences=2)
            except wikipedia.exceptions.DisambiguationError as e:
                # Ambiguous title: show the first few candidate pages.
                context[item] = f"Multiple entries found for '{item}': {e.options[:3]}"
            except wikipedia.exceptions.PageError:
                context[item] = f"No historical info found for '{item}'."
            except Exception as e:
                # Best-effort lookup: one failing entity must not abort the
                # rest, so the error is reported in place of a summary.
                context[item] = f"Error retrieving info: {e}"
    return context
98
 
99
  # Function to correct OCR errors (suggestions)
 
102
  suggestions = {}
103
  for word in words:
104
  if len(word) > 4 and not word.isnumeric():
105
+ close_matches = [w for w in ["document", "historical", "archive", "event", "location"]
106
+ if SequenceMatcher(None, word.lower(), w).ratio() > 0.75]
107
  if close_matches:
108
  suggestions[word] = close_matches[0]
109
  return suggestions