Spaces:
Sleeping
Sleeping
Michela committed on
Commit ·
559c653
1
Parent(s): a9cd115
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Import packages
|
| 2 |
import gradio as gr
|
| 3 |
import pandas as pd
|
|
@@ -23,18 +28,84 @@ data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed
|
|
| 23 |
# Pagination settings
|
| 24 |
R = 5 # Number of preview rows per page
|
| 25 |
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def highlight_text(text, highlights):
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if isinstance(highlights, str):
|
| 30 |
highlights = [highlights]
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
for highlight in highlights:
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
return text
|
| 36 |
|
| 37 |
-
|
| 38 |
# Function to create preview rows
|
| 39 |
def preview_results(page, selected_data_source):
|
| 40 |
data_source = data_sources[selected_data_source]
|
|
@@ -70,9 +141,9 @@ def show_details(document_name, selected_data_source):
|
|
| 70 |
return f"""
|
| 71 |
<div style="display: flex; justify-content: space-between; align-items: start;">
|
| 72 |
<div style="width: 65%; font-size: 18px;">
|
| 73 |
-
<h3>π Preview: {
|
| 74 |
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
|
| 75 |
-
<p><b>
|
| 76 |
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">π Open ΓNB Viewer</a></p>
|
| 77 |
</div>
|
| 78 |
<div style="width: 30%; text-align: right;">
|
|
@@ -92,7 +163,8 @@ with gr.Blocks() as demo:
|
|
| 92 |
## π Preview Text Retrieval Results with Marqo Vector Database
|
| 93 |
<div style="font-size: 18px;">
|
| 94 |
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
|
| 95 |
-
To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.
|
|
|
|
| 96 |
</div>""")
|
| 97 |
|
| 98 |
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17th- to 19th-century German texts in the ONiT project.
|
| 3 |
+
Code by Michela Vignoli partially generated with Chat GPT3, GPT4 (free version), and Claude (free version).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
# Import packages
|
| 7 |
import gradio as gr
|
| 8 |
import pandas as pd
|
|
|
|
| 28 |
# Pagination settings
|
| 29 |
R = 5 # Number of preview rows per page
|
| 30 |
|
| 31 |
+
def normalize_text(text):
    """Collapse whitespace in *text* for more forgiving matching.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space
    and leading/trailing whitespace is dropped. No other character is changed.

    Args:
        text (str): The text to normalize.

    Returns:
        str: The whitespace-normalized text.
    """
    return " ".join(text.split())


def _normalize_with_offsets(text):
    """Normalize like ``normalize_text`` and remember where each char came from.

    Returns:
        tuple: ``(normalized, offsets)`` where ``offsets[k]`` is the index in
        the original *text* of the character ``normalized[k]``. For a collapsed
        whitespace run the offset points at the first whitespace character.
    """
    chars = []
    offsets = []
    pending_space = None  # index of the first char of the current whitespace run
    for index, char in enumerate(text):
        if char.isspace():
            # Leading whitespace is dropped: only remember a run once real
            # content has been emitted.
            if chars and pending_space is None:
                pending_space = index
            continue
        if pending_space is not None:
            chars.append(" ")
            offsets.append(pending_space)
            pending_space = None
        chars.append(char)
        offsets.append(index)
    # A trailing whitespace run is never flushed, which trims the end.
    return "".join(chars), offsets


def find_best_match(needle, haystack, threshold=0.9):
    """Find where *needle* occurs in *haystack*, tolerating small differences.

    An exact occurrence wins immediately. Otherwise every exact block shared
    by the two strings is used as an anchor: the needle is aligned on that
    block and the needle-sized window of *haystack* at that alignment is
    scored with ``SequenceMatcher.ratio()``. The best window whose ratio is
    strictly greater than *threshold* is returned.

    Args:
        needle (str): The text to look for.
        haystack (str): The text to search in.
        threshold (float): Minimum similarity ratio a fuzzy match must exceed.

    Returns:
        tuple or None: ``(start, end)`` indices into *haystack*, or ``None``
        when nothing is similar enough.
    """
    # Imported locally so the function works even if the module header does
    # not import difflib.
    from difflib import SequenceMatcher

    if not needle or not haystack:
        return None

    exact_start = haystack.find(needle)
    if exact_start != -1:
        return (exact_start, exact_start + len(needle))

    # autojunk=False: the default heuristic ignores "popular" characters once
    # the second sequence has 200+ items, which cripples matching on page text.
    block_finder = SequenceMatcher(None, needle, haystack, autojunk=False)

    # SequenceMatcher caches details about its second sequence, so the needle
    # stays as seq2 and only seq1 (the candidate window) changes.
    scorer = SequenceMatcher(None, autojunk=False)
    scorer.set_seq2(needle)

    window = len(needle)
    best_span = None
    best_ratio = threshold
    tried_starts = set()

    for needle_pos, haystack_pos, size in block_finder.get_matching_blocks():
        if size == 0:  # the terminating dummy block
            continue
        # Align the whole needle on this shared block.
        start = max(0, haystack_pos - needle_pos)
        if start in tried_starts:
            continue
        tried_starts.add(start)
        end = min(len(haystack), start + window)

        scorer.set_seq1(haystack[start:end])
        ratio = scorer.ratio()
        if ratio > best_ratio:
            best_span = (start, end)
            best_ratio = ratio

    return best_span


def _merge_overlapping(spans):
    """Merge overlapping ``(start, end)`` spans into a sorted, disjoint list."""
    merged = []
    for start, end in sorted(spans):
        if merged and start < merged[-1][1]:
            previous_start, previous_end = merged[-1]
            merged[-1] = (previous_start, max(previous_end, end))
        else:
            merged.append((start, end))
    return merged


def highlight_text(text, highlights):
    """Wrap the parts of *text* that match *highlights* in ``<mark>`` tags.

    Matching is done on whitespace-normalized copies of the text and of each
    highlight (see ``find_best_match``); the matched positions are then mapped
    back to the original text so its whitespace is preserved.

    NOTE: *text* is inserted into the result as-is (it is not HTML-escaped).

    Args:
        text (str): The original text to highlight.
        highlights (str or list): Text segment(s) to highlight.

    Returns:
        str: Text with highlights wrapped in <mark> tags. *text* is returned
        unchanged when it is empty, not a string, or nothing matches.
    """
    if not text or not highlights:
        return text
    if not isinstance(text, str):
        # e.g. a missing value coming from the data source; nothing to mark.
        return text

    # Ensure highlights is a list
    if isinstance(highlights, str):
        highlights = [highlights]

    # Remove empty or None highlights
    highlights = [h for h in highlights if h]
    if not highlights:
        return text

    # Longest first, as in the original ordering.
    highlights = sorted(highlights, key=len, reverse=True)

    # The page text is the same for every highlight: normalize it once.
    normalized_text, offsets = _normalize_with_offsets(text)

    spans = []
    for highlight in highlights:
        match = find_best_match(normalize_text(highlight), normalized_text)
        if not match:
            continue
        start, end = match
        if start >= end:
            continue
        # Translate normalized indices back to indices in the original text.
        spans.append((offsets[start], offsets[end - 1] + 1))

    # Overlapping spans would yield interleaved <mark> tags; merge them first,
    # then insert from the end so earlier indices stay valid.
    for start, end in reversed(_merge_overlapping(spans)):
        text = f"{text[:start]}<mark>{text[start:end]}</mark>{text[end:]}"

    return text
|
| 108 |
|
|
|
|
| 109 |
# Function to create preview rows
|
| 110 |
def preview_results(page, selected_data_source):
|
| 111 |
data_source = data_sources[selected_data_source]
|
|
|
|
| 141 |
return f"""
|
| 142 |
<div style="display: flex; justify-content: space-between; align-items: start;">
|
| 143 |
<div style="width: 65%; font-size: 18px;">
|
| 144 |
+
<h3>π Preview: {document_name}</h3>
|
| 145 |
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
|
| 146 |
+
<p><b>Text on page {row['page']}: </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
|
| 147 |
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">π Open ΓNB Viewer</a></p>
|
| 148 |
</div>
|
| 149 |
<div style="width: 30%; text-align: right;">
|
|
|
|
| 163 |
## π Preview Text Retrieval Results with Marqo Vector Database
|
| 164 |
<div style="font-size: 18px;">
|
| 165 |
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
|
| 166 |
+
To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.
|
| 167 |
+
To inspect the page in the full book, click on <i>Open ONB Viewer</i> in the document details below.</p>
|
| 168 |
</div>""")
|
| 169 |
|
| 170 |
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
|