Spaces:

GEODE
/

encyclopedia-coordinates-viewer

Running

File size: 11,053 Bytes

import gradio as gr
from datasets import load_dataset
import pandas as pd
import re
import folium
import numpy as np

# --- 1. Initial Data Loading ---
print("Loading datasets...")
dfs = {}

# Key in `dfs` (and dataset name shown to the user) ->
# (short label used in log output, Hugging Face dataset repository id).
_DATASET_SOURCES = {
    "Encyclopédie de Diderot et d'Alembert": ("EDDA", "GEODE/edda-coordinata"),
    "Encyclopædia Britannica 7th edition": ("EB7", "pnugues/EB7"),
    "Encyclopædia Britannica 9th edition": ("EB9", "pnugues/EB9"),
}

# Each dataset is loaded in its own try block so that a failure on one of
# them (network error, renamed repository, ...) does not prevent the others
# from being available. Lookups go through `dfs.get(...)`, so a missing entry
# is tolerated downstream.
for _display_name, (_label, _repo_id) in _DATASET_SOURCES.items():
    print(f"- Loading {_label}...")
    try:
        dfs[_display_name] = load_dataset(_repo_id, split="train").to_pandas()
    except Exception as e:  # top-level boundary: report and keep going
        print(f"Error loading {_label} dataset ({_repo_id}): {e}")

if len(dfs) == len(_DATASET_SOURCES):
    print("Loading complete!")
else:
    print(f"Loading finished with errors: {len(dfs)}/{len(_DATASET_SOURCES)} datasets available.")

# --- 2. Utility Functions ---
def parse_coordinate(coord_str, meridian_name=None):
    """Turn a textual coordinate into decimal ``(latitude, longitude)``.

    The string is scanned for components of the form
    ``<deg> [<min>'] [<sec>"] <N|S|E|W>``. N/S components set the latitude
    (south is negative); E/W components set the longitude. When a direction
    family occurs several times, the last occurrence wins.

    The longitude is then shifted from the named reference meridian to
    Greenwich. A missing, non-string or blank ``meridian_name`` -- and any
    name not listed below -- is treated as "île de Fer".

    Returns:
        ``(lat, lon)`` as floats, or ``(None, None)`` when ``coord_str`` is
        not a string or does not contain both a latitude and a longitude.
    """
    if not isinstance(coord_str, str):
        return None, None

    dms_pattern = r"(\d+)\s*(?:(\d+)')?\s*(?:(\d+)[\"']{1,2})?\s*([NSEW])"

    latitude = None
    longitude = None
    hemisphere = None  # 'E' or 'W' of the most recent longitude component

    for degrees, minutes, seconds, direction in re.findall(dms_pattern, coord_str):
        magnitude = float(degrees or 0) + (float(minutes or 0) / 60) + (float(seconds or 0) / 3600)

        if direction == 'N':
            latitude = magnitude
        elif direction == 'S':
            latitude = -magnitude
        else:
            # The pattern only admits N/S/E/W, so this is an E or W component.
            longitude = magnitude
            hemisphere = direction

    if latitude is None or longitude is None:
        return None, None

    if isinstance(meridian_name, str) and meridian_name.strip():
        reference = meridian_name.strip()
    else:
        reference = "île de Fer"

    if reference == "Pékin":
        # NOTE(review): the sign convention here is the opposite of the other
        # meridians (W adds, E subtracts) -- presumably deliberate for how
        # these entries are annotated; confirm against the dataset.
        if hemisphere == 'W':
            return latitude, 116.39 + longitude
        return latitude, 116.39 - longitude

    signed_lon = longitude if hemisphere == 'E' else -longitude

    # Degrees to add to go from each reference meridian to Greenwich.
    greenwich_offsets = {
        "Paris": 2.35,
        "Lunden": 13.19,
        "Londres": 0.0,
        "London": 0.0,
        "Sydon": 35.37,
    }
    if reference in greenwich_offsets:
        return latitude, signed_lon + greenwich_offsets[reference]

    # Fallback: everything else is handled as the île de Fer meridian.
    return latitude, signed_lon - 17.66

def classify_geometry(x):
    """Label the shape of a nested coordinate container.

    Returns:
        * ``"none"``    -- ``x`` is not a list/ndarray, or is empty.
        * ``"point"``   -- a single inner sequence holding exactly one item.
        * ``"surface"`` -- a single inner sequence of any other length.
        * ``"subart"``, ``"multsrc"``, ``"pchain"`` or ``"misc"`` -- several
          entries whose first one is a one-item sequence holding that tag.
        * ``"unknown"`` -- anything else.
    """
    sequence_types = (list, np.ndarray)

    if not isinstance(x, sequence_types) or len(x) == 0:
        return "none"

    head = x[0]
    if not isinstance(head, sequence_types):
        return "unknown"

    if len(x) == 1:
        if len(head) == 1:
            return "point"
        return "surface"

    # More than one entry: the first may be a one-item tag describing the rest.
    if len(head) == 1 and head[0] in ('subart', 'multsrc', 'pchain', 'misc'):
        return head[0]
    return "unknown"

def get_meridian_safely(meridian_list, index):
    """Return the stripped meridian name at ``index``, or "île de Fer".

    The fallback is used when ``meridian_list`` is not a list/ndarray, when
    ``index`` is not below its length, or when the entry found there is not a
    non-blank string.
    """
    fallback = "île de Fer"

    if isinstance(meridian_list, (list, np.ndarray)) and index < len(meridian_list):
        candidate = meridian_list[index]
        if isinstance(candidate, str):
            cleaned = candidate.strip()
            if cleaned != "":
                return cleaned

    return fallback

# --- 3. Search and Mapping Engine ---
def _truncate(text, limit):
    """Return ``text`` cut to ``limit`` characters, adding '...' only if it was shortened."""
    return (text[:limit] + '...') if len(text) > limit else text


def search_and_map(query, search_mode, dataset_choice):
    """Search one encyclopedia and plot the matching articles on a map.

    Args:
        query: Search term, matched case-insensitively between word
            boundaries. An empty query yields an empty table and a blank map.
        search_mode: ``"text"`` searches the article body; any other value
            searches the headword column.
        dataset_choice: Key into the module-level ``dfs`` mapping. An unknown
            or unloaded dataset yields an empty table and a blank map.

    Returns:
        A ``(table, map_html)`` tuple: a DataFrame with at most 50 matching
        rows (article text shortened to 280 characters) and the folium map
        rendered as an HTML string.
    """
    # Local stdlib import: dataset text is interpolated into popup/tooltip
    # HTML and must be escaped so that characters such as '<' or '&' in an
    # article cannot break (or inject into) the generated markup.
    from html import escape

    df = dfs.get(dataset_choice)
    if not query or df is None:
        return pd.DataFrame(), folium.Map(location=[46.2, 2.2], zoom_start=4)._repr_html_()

    # EB7 and EB9 share the same columns and are handled by the same branch.
    is_eb = dataset_choice in ["Encyclopædia Britannica 7th edition", "Encyclopædia Britannica 9th edition"]

    # 1. Column selection for the search
    if is_eb:
        search_col = "texte" if search_mode == "text" else "vedette"
    else:
        search_col = "text" if search_mode == "text" else "headword"

    # Whole-word, case-insensitive match; missing values never match.
    mask = df[search_col].str.contains(r'\b' + re.escape(query) + r'\b', case=False, na=False)
    results = df[mask].copy()

    m = folium.Map(location=[46.2, 2.2], zoom_start=4)
    bounds = []

    # 2. Map rendering loop
    for _, row in results.iterrows():

        # --- BRITANNICA BRANCH (EB7 & EB9) ---
        if is_eb:
            coords_str = row.get('coords', '')
            texte_val = str(row.get('texte', ''))

            headword = escape(str(row.get('vedette', 'Unknown article')))
            snippet = escape(_truncate(texte_val, 150))

            if isinstance(coords_str, str) and coords_str.strip():
                # Force London meridian for British editions
                lat, lon = parse_coordinate(coords_str, "Londres")
                if lat is not None:
                    popup_html = f"<b>{headword}</b><br><i>Meridian: London (Greenwich)</i><br><br>{snippet}"
                    folium.Marker([lat, lon], popup=popup_html, tooltip=headword).add_to(m)
                    bounds.append([lat, lon])

        # --- ENCYCLOPÉDIE BRANCH (EDDA) ---
        else:
            meridian_list = row.get('meridian', [])
            if isinstance(meridian_list, np.ndarray):
                meridian_list = meridian_list.tolist()

            coords_raw = row.get('coordinates', [])
            if isinstance(coords_raw, (list, np.ndarray)):
                coords_list = [item.tolist() if isinstance(item, np.ndarray) else item for item in coords_raw]
            else:
                continue

            geom_type = classify_geometry(coords_list)
            raw_headword = row.get('headword', 'Unknown')
            headword = escape(str(raw_headword))

            texte_val = str(row.get('text', ''))
            snippet = escape(_truncate(texte_val, 150))

            # Best effort: a malformed row is reported and skipped rather
            # than aborting the whole search.
            try:
                if geom_type == "point":
                    current_meridian = get_meridian_safely(meridian_list, 0)
                    lat, lon = parse_coordinate(coords_list[0][0], current_meridian)
                    if lat is not None:
                        popup_html = f"<b>{headword}</b><br><i>Meridian: {escape(current_meridian)}</i><br><br>{snippet}"
                        folium.Marker([lat, lon], popup=popup_html, tooltip=headword).add_to(m)
                        bounds.append([lat, lon])

                elif geom_type == "surface":
                    # The first two coordinates are used as opposite corners.
                    current_meridian = get_meridian_safely(meridian_list, 0)
                    p1 = parse_coordinate(coords_list[0][0], current_meridian)
                    p2 = parse_coordinate(coords_list[0][1], current_meridian)
                    if p1[0] is not None and p2[0] is not None:
                        popup_html = f"<b>{headword}</b> (Area)<br><i>Meridian: {escape(current_meridian)}</i>"
                        folium.Rectangle(bounds=[p1, p2], color="orange", fill=True, popup=popup_html).add_to(m)
                        bounds.extend([p1, p2])

                elif geom_type in ["subart", "multsrc", "pchain"]:
                    points = []
                    # Entry 0 is the type tag; the remaining entries carry the coordinates.
                    for i, item in enumerate(coords_list[1:]):
                        c_str = item[0] if isinstance(item, (list, np.ndarray)) else item
                        current_meridian = get_meridian_safely(meridian_list, i)
                        p = parse_coordinate(c_str, current_meridian)
                        if p[0] is not None:
                            points.append((p[0], p[1], current_meridian))

                    if points:
                        if geom_type == "pchain":
                            coords_only = [[pt[0], pt[1]] for pt in points]
                            popup_html = f"<b>{headword}</b> (Path)<br><i>Meridian: {escape(points[0][2])}</i>"
                            folium.PolyLine(coords_only, color="blue", weight=3, popup=popup_html).add_to(m)
                            bounds.extend(coords_only)
                        else:
                            for pt in points:
                                popup_html = f"<b>{headword}</b><br><i>Meridian: {escape(pt[2])}</i><br><br>{snippet}"
                                folium.Marker([pt[0], pt[1]], icon=folium.Icon(color='green'), popup=popup_html, tooltip=headword).add_to(m)
                            bounds.extend([[pt[0], pt[1]] for pt in points])
            except Exception as e:
                print(f"EDDA rendering error for {raw_headword}: {e}")

    # --- 3. Zoom Logic ---
    if bounds:
        unique_pts = np.unique(bounds, axis=0)
        if len(unique_pts) <= 1:
            # A single location: fit to that point but cap the zoom at 5.
            # Assigning `zoom_start` on an already-built Map does not appear
            # to reach the rendered output, so the cap goes through fit_bounds.
            only_pt = unique_pts[0].tolist()
            m.fit_bounds([only_pt, only_pt], max_zoom=5)
        else:
            m.fit_bounds(bounds)

    # --- 4. Final Dataframe Formatting ---
    if is_eb:
        final_df = results[['vedette', 'coords', 'texte']].head(50).copy()
        final_df['texte'] = final_df['texte'].astype(str).map(lambda t: _truncate(t, 280))
    else:
        final_df = results[['headword', 'coordinates', 'meridian', 'text']].head(50).copy()
        final_df['text'] = final_df['text'].astype(str).map(lambda t: _truncate(t, 280))

    return final_df, m._repr_html_()

# --- 4. Gradio Interface ---
# Markdown text passed to gr.Markdown at the top of the page: title,
# disclaimer, links to the three source datasets and short usage instructions.
description = """
# 🌍 Historical Encyclopedias Coordinates Explorer
---

**Disclaimer:** This application is a demonstration prototype currently under development.

This application allows you to explore and compare manually annotated geographical coordinates from several major 18th and 19th-century encyclopedias: 
* The **Encyclopédie** by Diderot and d'Alembert (FR, ~1751): https://huggingface.co/datasets/GEODE/edda-coordinata
* The **Encyclopædia Britannica** 7th edition (EN, ~1842): https://huggingface.co/datasets/pnugues/EB7
* The **Encyclopædia Britannica** 9th edition (EN, ~1875): https://huggingface.co/datasets/pnugues/EB9

Select the dataset, then search for any term within the article's text or its title (headword). The corresponding coordinates will be automatically projected onto the interactive map.

---
"""

# Dataset names offered in the dropdown; the first one is preselected.
dataset_names = [
    "Encyclopédie de Diderot et d'Alembert",
    "Encyclopædia Britannica 7th edition",
    "Encyclopædia Britannica 9th edition",
]

with gr.Blocks(title="Historical Encyclopedias Coordinates Explorer") as demo:
    gr.Markdown(description)

    with gr.Row():
        # Narrow column: dataset selection and search controls.
        with gr.Column(scale=1):
            dataset_dropdown = gr.Dropdown(
                choices=dataset_names,
                value=dataset_names[0],
                label="Choose Dataset",
            )
            search_input = gr.Textbox(
                label="Search term",
                placeholder="E.g., Acapulco, Brest, Berlin...",
            )
            search_mode = gr.Radio(
                choices=["headword", "text"],
                value="headword",
                label="Search in:",
            )
            btn = gr.Button("Search on map", variant="primary")

        # Wide column: the rendered map.
        with gr.Column(scale=2):
            map_output = gr.HTML(label="Map Visualization")

    # Results table sits below the controls/map row.
    table_output = gr.Dataframe(label="Results (max 50)", interactive=False, wrap=True)

    inputs = [search_input, search_mode, dataset_dropdown]
    outputs = [table_output, map_output]

    # Clicking the button and pressing Enter in the textbox run the same search.
    for register in (btn.click, search_input.submit):
        register(search_and_map, inputs, outputs)

if __name__ == "__main__":
    demo.launch()