historic score matching
Browse files- app.py +1 -1
- inference_tab/inference_logic.py +76 -2
- inference_tab/inference_setup.py +16 -4
app.py
CHANGED
|
@@ -16,7 +16,7 @@ logging.basicConfig(level=logging.DEBUG)
|
|
| 16 |
|
| 17 |
with gr.Blocks() as demo:
|
| 18 |
with gr.Tab("Inference"):
|
| 19 |
-
image_input, gcp_input, city_name,user_crs, score_th, run_button, output, download_file = get_inference_widgets(run_inference,georefImg)
|
| 20 |
with gr.Tab("Annotation"):
|
| 21 |
get_annotation_widgets()
|
| 22 |
with gr.Tab("Map"):
|
|
|
|
| 16 |
|
| 17 |
with gr.Blocks() as demo:
|
| 18 |
with gr.Tab("Inference"):
|
| 19 |
+
image_input, gcp_input, city_name,user_crs, score_th, hist_th, hist_dic, run_button, output, download_file = get_inference_widgets(run_inference,georefImg)
|
| 20 |
with gr.Tab("Annotation"):
|
| 21 |
get_annotation_widgets()
|
| 22 |
with gr.Tab("Map"):
|
inference_tab/inference_logic.py
CHANGED
|
@@ -34,7 +34,7 @@ _trocr_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 34 |
|
| 35 |
|
| 36 |
|
| 37 |
-
def run_inference(tile_dict, gcp_path, city_name, score_th):
|
| 38 |
IMAGE_FOLDER = os.path.join(OUTPUT_DIR, "blobs")
|
| 39 |
CSV_FILE = os.path.join(OUTPUT_DIR, "annotations.csv")
|
| 40 |
MASK_FILE = os.path.join(OUTPUT_DIR, "mask.tif")
|
|
@@ -110,6 +110,18 @@ def run_inference(tile_dict, gcp_path, city_name, score_th):
|
|
| 110 |
log += msg + "\n"
|
| 111 |
yield log, None
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
|
| 115 |
def load_trocr_model():
|
|
@@ -556,7 +568,6 @@ def georefImg(image_path, gcp_path, user_crs):
|
|
| 556 |
format='GTiff'
|
| 557 |
)
|
| 558 |
|
| 559 |
-
|
| 560 |
|
| 561 |
yield "The map is georeferenced."
|
| 562 |
|
|
@@ -607,6 +618,69 @@ def extractStreetNet(city_name):
|
|
| 607 |
|
| 608 |
|
| 609 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
def fuzzyMatch(score_th,tile_dict):
|
| 611 |
COORD_PATH=os.path.join(OUTPUT_DIR,"centroids.csv")
|
| 612 |
OCR_PATH=os.path.join(OUTPUT_DIR,"ocr.csv")
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
|
| 37 |
+
def run_inference(tile_dict, gcp_path, city_name, score_th, hist_th, hist_dic):
|
| 38 |
IMAGE_FOLDER = os.path.join(OUTPUT_DIR, "blobs")
|
| 39 |
CSV_FILE = os.path.join(OUTPUT_DIR, "annotations.csv")
|
| 40 |
MASK_FILE = os.path.join(OUTPUT_DIR, "mask.tif")
|
|
|
|
| 110 |
log += msg + "\n"
|
| 111 |
yield log, None
|
| 112 |
|
| 113 |
+
if hist_dic is not None:
|
| 114 |
+
# Run fuzzy match against historic street names
|
| 115 |
+
for msg in fuzzyMatchHist(hist_dic, hist_th):
|
| 116 |
+
if isinstance(msg, list):
|
| 117 |
+
log += "Historic fuzzy matching finished! CSVs saved at:\n"
|
| 118 |
+
for f in msg:
|
| 119 |
+
log += f" - {f}\n"
|
| 120 |
+
yield log, msg
|
| 121 |
+
else:
|
| 122 |
+
log += msg + "\n"
|
| 123 |
+
yield log, None
|
| 124 |
+
|
| 125 |
|
| 126 |
|
| 127 |
def load_trocr_model():
|
|
|
|
| 568 |
format='GTiff'
|
| 569 |
)
|
| 570 |
|
|
|
|
| 571 |
|
| 572 |
yield "The map is georeferenced."
|
| 573 |
|
|
|
|
| 618 |
|
| 619 |
|
| 620 |
|
| 621 |
+
def fuzzyMatchHist(hist_dic, hist_th):
    """Fuzzy-match OCR'd street names against a list of historic street names.

    Reads the historic names from ``hist_dic`` (a header-less, single-column
    CSV) and the OCR predictions from ``OUTPUT_DIR/ocr.csv`` (header-less,
    columns ``blob_id, pred_text``). For every OCR prediction the best
    historic candidate is found with ``fuzz.token_sort_ratio`` and all results
    are written to ``OUTPUT_DIR/historic_matches.csv``.

    Blob images (``<blob_id>.png`` and ``<blob_id>_margin.png`` under
    ``OUTPUT_DIR/blobs``) whose match score is at or above the threshold are
    deleted. NOTE(review): presumably this is so that only low-scoring blobs
    remain for manual annotation -- confirm against the annotation tab.

    Args:
        hist_dic: Path or file object accepted by ``pandas.read_csv``.
        hist_th: Score threshold; a number or numeric string.

    Yields:
        A progress message (``str``), then a one-element ``list`` holding the
        path of the CSV with all match results.

    Raises:
        ValueError: If ``hist_th`` is a string that is not numeric.
    """
    # The threshold comes from a free-text box: accept decimals and report a
    # readable error instead of the bare int() message for empty/invalid text.
    try:
        threshold = float(hist_th)
    except ValueError as err:
        raise ValueError(
            f"Historic score threshold must be a number, got {hist_th!r}"
        ) from err

    # === Load Data ===
    hist_df = pd.read_csv(hist_dic, header=None, names=["hist_name"])
    ocr_path = os.path.join(OUTPUT_DIR, "ocr.csv")
    names_df = pd.read_csv(
        ocr_path,
        names=["blob_id", "pred_text"],
        dtype={"blob_id": "int64", "pred_text": "string"},
    )
    historic_names = hist_df["hist_name"].dropna().astype(str).tolist()

    # === Fuzzy Match ===
    result_columns = ["blob_id", "ocr_name", "best_hist_match", "match_score"]
    results = []
    ocr_rows = names_df.dropna(subset=["pred_text"])
    for blob_id, ocr_name in zip(ocr_rows["blob_id"], ocr_rows["pred_text"]):
        match = process.extractOne(
            ocr_name,
            historic_names,
            scorer=fuzz.token_sort_ratio,
        )
        # extractOne returns None when there is no candidate (e.g. an empty
        # historic list); skip instead of failing on tuple unpacking.
        if match is None:
            continue
        # Index rather than unpack: some fuzzy-matching backends return
        # (match, score) and others (match, score, index).
        results.append({
            "blob_id": blob_id,
            "ocr_name": ocr_name,
            "best_hist_match": match[0],
            "match_score": match[1],
        })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when there are no results, so the filter below cannot raise KeyError.
    results_df = pd.DataFrame(results, columns=result_columns)

    # === Save all results ===
    all_results_path = os.path.join(OUTPUT_DIR, "historic_matches.csv")
    results_df.to_csv(all_results_path, index=False)

    # === Remove blobs that matched a historic name well enough ===
    matched_df = results_df[results_df["match_score"] >= threshold]
    blob_dir = os.path.join(OUTPUT_DIR, "blobs")
    for blob_id in matched_df["blob_id"]:
        # original blob and its marginalized counterpart
        for filename in (f"{blob_id}.png", f"{blob_id}_margin.png"):
            try:
                os.remove(os.path.join(blob_dir, filename))
            except FileNotFoundError:
                # Already absent; nothing to clean up.
                pass

    yield "Historic fuzzy matching complete."
    yield [all_results_path]
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
|
| 684 |
def fuzzyMatch(score_th,tile_dict):
|
| 685 |
COORD_PATH=os.path.join(OUTPUT_DIR,"centroids.csv")
|
| 686 |
OCR_PATH=os.path.join(OUTPUT_DIR,"ocr.csv")
|
inference_tab/inference_setup.py
CHANGED
|
@@ -73,7 +73,8 @@ def select_tile(evt: gr.SelectData,state):
|
|
| 73 |
return None, gr.update(interactive=False), state
|
| 74 |
|
| 75 |
|
| 76 |
-
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
def get_inference_widgets(run_inference,georefImg):
|
|
@@ -101,7 +102,18 @@ def get_inference_widgets(run_inference,georefImg):
|
|
| 101 |
type="numpy", label="Selected Tile",
|
| 102 |
height=500, width=500
|
| 103 |
)
|
| 104 |
-
score_th = gr.Textbox(label="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
run_button = gr.Button("Run Inference", interactive=False)
|
| 106 |
output = gr.Textbox(label="Progress", lines=5, interactive=False)
|
| 107 |
download_file = gr.File(label="Download CSV",
|
|
@@ -122,7 +134,7 @@ def get_inference_widgets(run_inference,georefImg):
|
|
| 122 |
)
|
| 123 |
run_button.click(
|
| 124 |
fn=run_inference,
|
| 125 |
-
inputs=[selected_tile_path, gcp_input, city_name, score_th],
|
| 126 |
outputs=[output, download_file]
|
| 127 |
)
|
| 128 |
|
|
@@ -133,4 +145,4 @@ def get_inference_widgets(run_inference,georefImg):
|
|
| 133 |
)
|
| 134 |
|
| 135 |
|
| 136 |
-
return image_input, gcp_input, city_name, user_crs, score_th, run_button, output, download_file
|
|
|
|
| 73 |
return None, gr.update(interactive=False), state
|
| 74 |
|
| 75 |
|
| 76 |
+
def enable_textbox(file):
    """Build a Gradio update that toggles interactivity based on ``file``.

    The returned update sets ``interactive`` to True when ``file`` is truthy
    (e.g. a file has been uploaded) and to False otherwise.
    """
    has_upload = bool(file)
    return gr.update(interactive=has_upload)
|
| 78 |
|
| 79 |
|
| 80 |
def get_inference_widgets(run_inference,georefImg):
|
|
|
|
| 102 |
type="numpy", label="Selected Tile",
|
| 103 |
height=500, width=500
|
| 104 |
)
|
| 105 |
+
score_th = gr.Textbox(label="Score threshold below which to annotate manually (OSM)",
|
| 106 |
+
info="Computes fuzzy match of the detected street names with OSM street names within 100m buffer")
|
| 107 |
+
|
| 108 |
+
# Historic dictionary of street names and matching score threshold
|
| 109 |
+
hist_dic = gr.File(label="Upload csv with historic street names",file_types=[".csv"])
|
| 110 |
+
hist_th = gr.Textbox(label="Score threshold below which to annotate manually (Directory)",
|
| 111 |
+
info="Computes fuzzy match of the detected street names with the historic street names",
|
| 112 |
+
interactive=False)
|
| 113 |
+
hist_dic.change(enable_textbox, inputs=hist_dic, outputs=hist_th)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
|
| 117 |
run_button = gr.Button("Run Inference", interactive=False)
|
| 118 |
output = gr.Textbox(label="Progress", lines=5, interactive=False)
|
| 119 |
download_file = gr.File(label="Download CSV",
|
|
|
|
| 134 |
)
|
| 135 |
run_button.click(
|
| 136 |
fn=run_inference,
|
| 137 |
+
inputs=[selected_tile_path, gcp_input, city_name, score_th, hist_th,hist_dic],
|
| 138 |
outputs=[output, download_file]
|
| 139 |
)
|
| 140 |
|
|
|
|
| 145 |
)
|
| 146 |
|
| 147 |
|
| 148 |
+
return image_input, gcp_input, city_name, user_crs, score_th, hist_th,hist_dic, run_button, output, download_file
|