Ubuntu committed · Commit 2c0c15a · 1 Parent(s): c3e689d

label docs

Files changed (2):

1. app.py +2 -0
2. pages/2_recherche_docs.py +120 -56

app.py CHANGED

```diff
@@ -12,6 +12,8 @@ os.environ['AWS_SECRET_ACCESS_KEY'] = st.secrets['AWS_SECRET_ACCESS_KEY']
 os.environ['AWS_REGION'] = st.secrets['AWS_REGION']
 os.environ['S3_FOLDER'] = st.secrets['S3_FOLDER']
 
+PATH_S3_LABELS = os.path.join(st.secrets['S3_FOLDER'])
+
 st.set_page_config(
     page_title="Login",
     page_icon="👋",
```
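
Note: `PATH_S3_LABELS` is defined here with a single-argument `os.path.join`, which simply returns `st.secrets['S3_FOLDER']` unchanged; the page below joins the actual file name onto the same prefix. A small sketch of the difference, using a hypothetical `S3_FOLDER` value standing in for the real secret:

```python
import os

# Hypothetical secret value, for illustration only; the real one lives in the Space's secrets.
S3_FOLDER = "s3://my-bucket/mairie-docs"

# app.py: os.path.join with one argument is a no-op.
PATH_S3_LABELS_APP = os.path.join(S3_FOLDER)
assert PATH_S3_LABELS_APP == S3_FOLDER

# pages/2_recherche_docs.py: the file name is appended to the same prefix.
PATH_S3_LABELS_PAGE = os.path.join(S3_FOLDER, "MAIRIE_doc_labels.csv")
print(PATH_S3_LABELS_PAGE)  # s3://my-bucket/mairie-docs/MAIRIE_doc_labels.csv
```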

pages/2_recherche_docs.py CHANGED

```diff
@@ -1,22 +1,34 @@
 import datetime
-
 import streamlit as st
 import pandas as pd
 import unicodedata
 from collections import deque, defaultdict
 import re
+import random
+import os
+from collections import defaultdict
+import awswrangler.s3 as s3
 
 st.set_page_config(page_title="recherche", page_icon="👋", layout="wide")
 
+# Redirect to login if not authenticated
 if not st.session_state.get("user_login_success", False):
     st.switch_page("app.py")
 
-st.markdown("## Recherche de documents par mots clés")
+PATH_S3_LABELS = os.path.join(st.secrets['S3_FOLDER'], 'MAIRIE_doc_labels.csv')
+
+#del st.session_state.doc_status
+# BE CAREFUL !
+if 'doc_status' not in st.session_state:
+    df_doc_status = s3.read_parquet(PATH_S3_LABELS)
+    doc_status = dict(zip(df_doc_status['id'], df_doc_status['label']))
+    st.session_state.doc_status = doc_status
 
 df_docs = st.session_state["db_docs"]
 df_cities = st.session_state["db_cities"]
 
-def format_text(input_text:str):
+# Text normalization
+def format_text(input_text: str):
     if input_text is None:
         return input_text
     input_text = input_text.replace("\n", "")
```
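
The new `doc_status` block is the usual Streamlit load-once pattern: the S3 read happens only on the first script run of a session, and every rerun afterwards reuses the dict cached in `st.session_state`. (The labels are read and written with awswrangler's parquet helpers even though the path ends in `.csv`; awswrangler goes by the function called, not the extension.) A self-contained sketch of the pattern, with a stubbed loader and sample rows standing in for `s3.read_parquet(PATH_S3_LABELS)`:

```python
import streamlit as st
import pandas as pd

def load_labels() -> pd.DataFrame:
    # Stand-in for s3.read_parquet(PATH_S3_LABELS); sample rows for illustration.
    return pd.DataFrame({"id": ["doc-1", "doc-2"], "label": ["favorite", "backlog"]})

# Guarded initialisation: the load runs once per session, not on every rerun.
if "doc_status" not in st.session_state:
    df_doc_status = load_labels()
    st.session_state.doc_status = dict(zip(df_doc_status["id"], df_doc_status["label"]))

st.write(st.session_state.doc_status)  # {'doc-1': 'favorite', 'doc-2': 'backlog'}
```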

```diff
@@ -27,11 +39,9 @@ def format_text(input_text:str):
     input_text = input_text.strip()
     return input_text
 
+# Search function
 def find_qualifying_groups(text, word_list, distance_threshold):
-    # Merge all indices into a single list with identification
-
     nb_words_to_find = len(word_list)
-
     merged_indices = []
 
     for i, word in enumerate(word_list):
@@ -43,51 +53,44 @@ def find_qualifying_groups(text, word_list, distance_threshold):
         merged_indices.extend([(i, j) for j in matches])
 
     merged_indices = sorted(merged_indices, key=lambda x: x[1])
-
-    # Use a sliding window to find all qualifying groups
     qualifying_groups = []
     window = deque()
-    indices_in_window = defaultdict(int) # Tracks count of each char_id in the current window
-
-    for char_id, char_pos in merged_indices:
+    indices_in_window = defaultdict(int)
 
-        # Add new index to the window
+    for char_id, char_pos in merged_indices:
         window.append((char_id, char_pos))
         indices_in_window[char_id] += 1
-
+
         while window and window[-1][1] - window[0][1] >= distance_threshold:
             removed_char_id, _ = window.popleft()
             indices_in_window[removed_char_id] -= 1
             if indices_in_window[removed_char_id] == 0:
                 del indices_in_window[removed_char_id]
-
-        # Check if we have a qualifying group (each char_id is represented exactly once)
+
         if len(indices_in_window) == nb_words_to_find:
             qualifying_groups.append([pos for _, pos in window])
-
 
     if len(qualifying_groups) == 0:
         return None
-
-    qualifying_groups = [(max(0,min(L)-100), min(max(L)+100, len(text))) for L in qualifying_groups]
+
+    qualifying_groups = [(max(0, min(L) - 100), min(max(L) + 100, len(text))) for L in qualifying_groups]
     qualifying_groups = [text[g[0]:g[1]] for g in qualifying_groups]
     return ('\n - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \n').join(qualifying_groups)
 
-
-
+# UI
 with st.container(border=True):
-
-    col1, col2= st.columns([0.4, 0.2])
+    st.markdown("### Recherche de documents par mots clés")
+    col1, col2 = st.columns([0.4, 0.2])
     search_text = col1.text_input(
         label="Mots clés",
-        placeholder="Separer chaque expression par une virgule",
+        placeholder="Séparer chaque expression par une virgule",
    )
 
     search_distance = col2.slider(
         "Distance entre mots clés",
-        min_value= 100,
-        value = 600,
-        max_value = 1200,
+        min_value=100,
+        value=600,
+        max_value=1200,
         step=20
     )
 
```
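
`find_qualifying_groups` is a sliding-window co-occurrence search: every keyword's match positions are merged into one position-sorted list, the window is shrunk whenever its span reaches `distance_threshold`, and a window qualifies once it holds at least one match for every keyword; each hit is then padded by 100 characters on both sides and the snippets are joined with a dashed separator. A standalone sketch of the same idea, returning position groups instead of snippets; the match-collection line uses `re.finditer`, which is an assumption, since that part of the file sits outside the diff context:

```python
import re
from collections import deque, defaultdict

def find_groups(text, word_list, distance_threshold):
    # Collect (keyword_index, position) pairs for every keyword, then sort by
    # position so a single left-to-right pass sees all matches in text order.
    merged = []
    for i, word in enumerate(word_list):
        # Assumption: occurrences are located roughly like this in the app.
        merged.extend((i, m.start()) for m in re.finditer(re.escape(word), text))
    merged.sort(key=lambda x: x[1])

    groups = []
    window = deque()
    counts = defaultdict(int)  # keyword index -> matches inside the window
    for word_id, pos in merged:
        window.append((word_id, pos))
        counts[word_id] += 1
        # Shrink the window until its character span is below the threshold.
        while window and window[-1][1] - window[0][1] >= distance_threshold:
            old_id, _ = window.popleft()
            counts[old_id] -= 1
            if counts[old_id] == 0:
                del counts[old_id]
        # The window qualifies when every keyword occurs at least once in it.
        if len(counts) == len(word_list):
            groups.append([p for _, p in window])
    return groups

text = "le budget a été voté ... la commune a discuté le budget avant le vote"
print(find_groups(text, ["budget", "vote"], 40))  # one group near the end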
 
```diff
@@ -95,15 +98,15 @@ with st.container(border=True):
 
     search_institution = col3.multiselect(
         label="Institution(s)",
-        default = ['Commmune'],
+        default=['Commmune'],
         options=["Commmune"],
     )
 
     search_regions = col4.multiselect(
-        label="Region(s)",
+        label="Région(s)",
         options=['ALL'] + sorted(list(df_cities['region_name'].unique())),
-        default = ['ALL'],
-        placeholder="Region(s)",
+        default=['ALL'],
+        placeholder="Région(s)",
     )
 
     now = datetime.datetime.now()
@@ -116,47 +119,108 @@ with st.container(border=True):
         max_value=now,
         format="DD.MM.YYYY",
     )
-    st.write("###")
-
-    col1, col2, col3 = st.columns([0.5, 0.3, 0.45])
-    go_search = col2.button("Rechercher", type="primary")
 
+    st.write("###")
 
-if go_search and len(search_dates)==2:
+    col1, col2, col3 = st.columns([0.35, 0.3, 0.45])
+    go_search = col2.button("Rechercher", type="primary", use_container_width=True)
 
+# Only run search if button clicked OR results not in session yet
+if (go_search or "df_results" not in st.session_state) and len(search_dates) == 2:
     search_dates = pd.Timestamp(search_dates[0]), pd.Timestamp(search_dates[1])
-
+
     if 'ALL' not in search_regions:
         search_pattern = '|'.join(search_regions)
-        df_subset_docs = df_docs[df_docs["region"].str.contains(search_pattern)]
+        df_subset_docs = df_docs[df_docs["region"].str.contains(search_pattern, na=False)]
     else:
         df_subset_docs = df_docs
-    df_subset_docs = df_subset_docs[df_subset_docs["scan_date"].between(search_dates[0],search_dates[1])]
+
+    df_subset_docs = df_subset_docs[df_subset_docs["scan_date"].between(search_dates[0], search_dates[1])]
 
     df_subset_docs['selected_texts'] = ''
     all_search_expressions = search_text.split(',')
     valid_search_expressions = []
+
     for word in all_search_expressions:
         word = format_text(word)
         if word != '':
             valid_search_expressions.append(word)
-            df_subset_docs = df_subset_docs[df_subset_docs['text_content'].str.contains(word)] #will remove Nan values
-
-    df_subset_docs['selected_texts'] = df_subset_docs['text_content'].apply(lambda x : find_qualifying_groups(text = x,
-                                                                            word_list = valid_search_expressions,
-                                                                            distance_threshold = search_distance))
-    df_subset_docs = df_subset_docs.dropna(subset=['selected_texts'])
-
-    st.dataframe(
-        df_subset_docs[['scan_date', 'city_name','region', 'selected_texts', 'url']],
-        column_config={
-            'city_name' : st.column_config.TextColumn(width="medium"),
-            'region' : st.column_config.TextColumn(width="medium"),
-            "url": st.column_config.LinkColumn(width="large"),
-            "scan_date": st.column_config.DateColumn(disabled=True, width="small"),
-            'selected_texts' : st.column_config.TextColumn(width="large"),
-        }
+            df_subset_docs = df_subset_docs[df_subset_docs['text_content'].str.contains(word, na=False)]
+
+    df_subset_docs['selected_texts'] = df_subset_docs['text_content'].apply(
+        lambda x: find_qualifying_groups(
+            text=x,
+            word_list=valid_search_expressions,
+            distance_threshold=search_distance
+        )
     )
-
-
+    df_subset_docs = df_subset_docs.dropna(subset=['selected_texts'])
+    st.session_state.df_results = df_subset_docs[['id', 'scan_date', 'city_name', 'region', 'selected_texts', 'url']].reset_index(drop=True)
+
+
+
+if 'random_key' not in st.session_state:
+    st.session_state.random_key = random.random()
+
+# Display persisted results
+if "df_results" in st.session_state:
+
+    df_results = st.session_state.df_results
+
+    with st.container(border=True):
+        st.markdown("### Resultats")
+
+        def get_colors(row):
+            status = st.session_state.doc_status.get(row['id'], None)
+            if status == 'favorite':
+                color = ['background-color:#e6ffea'] * 6
+            elif status == 'backlog':
+                color = ['background-color:#ffe5e5'] * 6
+            else:
+                color = ['background-color:#e6f0ff'] * 6
+            return color
+
+
+        event = st.dataframe(
+            df_results.style.apply(get_colors, axis=1),
+            column_config={
+                'id': st.column_config.TextColumn("Id", disabled=True, width="small"),
+                'scan_date': st.column_config.DateColumn("Date Scan", disabled=True, format="DD.MM.YYYY", width="small"),
+                'city_name': st.column_config.TextColumn("Ville", disabled=True, width="small"),
+                'region': st.column_config.TextColumn("Région", disabled=True, width="small"),
+                'selected_texts': st.column_config.TextColumn("Extrait", disabled=True, width="large"),
+                'url': st.column_config.LinkColumn("URL", disabled=True, width="medium"),
+            },
+            use_container_width=True,
+            on_select="rerun",
+            hide_index=True,
+            key=st.session_state.random_key,
+
+            selection_mode="multi-row",
+            #row_style=lambda row: "background-color: lightgreen;" if row["to_look_at"] else "background-color: #ffd6d6;"
+        )
+        #st.session_state.df_results = df_updated
+
+        _, col22, _, col33, _ = st.columns([0.36, 0.2, 0.1, 0.2, 0.4])
+        button_add_doc_backlog = col22.button(':x:', use_container_width=True)
+        button_add_doc_interest = col33.button(':white_check_mark:', use_container_width=True)
+
+        if button_add_doc_backlog:
+            for element in event.selection['rows']:
+                st.session_state.doc_status[df_results.iloc[element]['id']] = 'backlog'
+            st.session_state.random_key = random.random()
+            st.rerun()
+        if button_add_doc_interest:
+            for element in event.selection['rows']:
+                st.session_state.doc_status[df_results.iloc[element]['id']] = 'favorite'
+            st.session_state.random_key = random.random()
+            st.rerun()
+
+        st.text('')
+        _, center_col_save, _ = st.columns([0.3, 0.2, 0.35])
+        save_selection_to_s3 = center_col_save.button("Sauvergarder les labels", type="primary", use_container_width=True)
+        if save_selection_to_s3:
+            df_doc_status = pd.DataFrame(list(st.session_state.doc_status.items()), columns=['id', 'label'])
+            s3.to_parquet(df = df_doc_status, path = PATH_S3_LABELS)
+
 
```
226