Ubuntu committed · Commit 2c0c15a · 1 Parent(s): c3e689d

label docs

Files changed (2):

1. app.py +2 -0
2. pages/2_recherche_docs.py +120 -56

app.py CHANGED

```diff
@@ -12,6 +12,8 @@ os.environ['AWS_SECRET_ACCESS_KEY'] = st.secrets['AWS_SECRET_ACCESS_KEY']
 os.environ['AWS_REGION'] = st.secrets['AWS_REGION']
 os.environ['S3_FOLDER'] = st.secrets['S3_FOLDER']
 
+PATH_S3_LABELS = os.path.join(st.secrets['S3_FOLDER'])
+
 st.set_page_config(
     page_title="Login",
     page_icon="👋",
```
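
Note: `PATH_S3_LABELS` is defined here with a single-argument `os.path.join`, which simply returns `st.secrets['S3_FOLDER']` unchanged; the page below joins the actual file name onto the same prefix. A small sketch of the difference, using a hypothetical `S3_FOLDER` value standing in for the real secret:

```python
import os

# Hypothetical secret value, for illustration only; the real one lives in the Space's secrets.
S3_FOLDER = "s3://my-bucket/mairie-docs"

# app.py: os.path.join with one argument is a no-op.
PATH_S3_LABELS_APP = os.path.join(S3_FOLDER)
assert PATH_S3_LABELS_APP == S3_FOLDER

# pages/2_recherche_docs.py: the file name is appended to the same prefix.
PATH_S3_LABELS_PAGE = os.path.join(S3_FOLDER, "MAIRIE_doc_labels.csv")
print(PATH_S3_LABELS_PAGE)  # s3://my-bucket/mairie-docs/MAIRIE_doc_labels.csv
```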

pages/2_recherche_docs.py CHANGED

```diff
@@ -1,22 +1,34 @@
 import datetime
-
 import streamlit as st
 import pandas as pd
 import unicodedata
 from collections import deque, defaultdict
 import re
+import random
+import os
+from collections import defaultdict
+import awswrangler.s3 as s3
 
 st.set_page_config(page_title="recherche", page_icon="👋", layout="wide")
 
+# Redirect to login if not authenticated
 if not st.session_state.get("user_login_success", False):
     st.switch_page("app.py")
 
-st.markdown("## Recherche de documents par mots clés")
+PATH_S3_LABELS = os.path.join(st.secrets['S3_FOLDER'], 'MAIRIE_doc_labels.csv')
+
+#del st.session_state.doc_status
+# BE CAREFUL !
+if 'doc_status' not in st.session_state:
+    df_doc_status = s3.read_parquet(PATH_S3_LABELS)
+    doc_status = dict(zip(df_doc_status['id'], df_doc_status['label']))
+    st.session_state.doc_status = doc_status
 
 df_docs = st.session_state["db_docs"]
 df_cities = st.session_state["db_cities"]
 
-def format_text(input_text:str):
+# Text normalization
+def format_text(input_text: str):
     if input_text is None:
         return input_text
     input_text = input_text.replace("\n", "")
```
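
The new `doc_status` block is the usual Streamlit load-once pattern: the S3 read happens only on the first script run of a session, and every rerun afterwards reuses the dict cached in `st.session_state`. (The labels are read and written with awswrangler's parquet helpers even though the path ends in `.csv`; awswrangler goes by the function called, not the extension.) A self-contained sketch of the pattern, with a stubbed loader and sample rows standing in for `s3.read_parquet(PATH_S3_LABELS)`:

```python
import streamlit as st
import pandas as pd

def load_labels() -> pd.DataFrame:
    # Stand-in for s3.read_parquet(PATH_S3_LABELS); sample rows for illustration.
    return pd.DataFrame({"id": ["doc-1", "doc-2"], "label": ["favorite", "backlog"]})

# Guarded initialisation: the load runs once per session, not on every rerun.
if "doc_status" not in st.session_state:
    df_doc_status = load_labels()
    st.session_state.doc_status = dict(zip(df_doc_status["id"], df_doc_status["label"]))

st.write(st.session_state.doc_status)  # {'doc-1': 'favorite', 'doc-2': 'backlog'}
```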

```diff
@@ -27,11 +39,9 @@ def format_text(input_text:str):
     input_text = input_text.strip()
     return input_text
 
+# Search function
 def find_qualifying_groups(text, word_list, distance_threshold):
-    # Merge all indices into a single list with identification
-
     nb_words_to_find = len(word_list)
-
     merged_indices = []
 
     for i, word in enumerate(word_list):
@@ -43,51 +53,44 @@ def find_qualifying_groups(text, word_list, distance_threshold):
         merged_indices.extend([(i, j) for j in matches])
 
     merged_indices = sorted(merged_indices, key=lambda x: x[1])
-
-    # Use a sliding window to find all qualifying groups
     qualifying_groups = []
     window = deque()
-    indices_in_window = defaultdict(int) # Tracks count of each char_id in the current window
-
-    for char_id, char_pos in merged_indices:
+    indices_in_window = defaultdict(int)
 
-        # Add new index to the window
+    for char_id, char_pos in merged_indices:
         window.append((char_id, char_pos))
         indices_in_window[char_id] += 1
-
+
         while window and window[-1][1] - window[0][1] >= distance_threshold:
             removed_char_id, _ = window.popleft()
             indices_in_window[removed_char_id] -= 1
             if indices_in_window[removed_char_id] == 0:
                 del indices_in_window[removed_char_id]
-
-        # Check if we have a qualifying group (each char_id is represented exactly once)
+
         if len(indices_in_window) == nb_words_to_find:
             qualifying_groups.append([pos for _, pos in window])
-
 
     if len(qualifying_groups) == 0:
         return None
-
-    qualifying_groups = [(max(0,min(L)-100), min(max(L)+100, len(text))) for L in qualifying_groups]
+
+    qualifying_groups = [(max(0, min(L) - 100), min(max(L) + 100, len(text))) for L in qualifying_groups]
     qualifying_groups = [text[g[0]:g[1]] for g in qualifying_groups]
     return ('\n - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \n').join(qualifying_groups)
 
-
-
+# UI
 with st.container(border=True):
-
-    col1, col2= st.columns([0.4, 0.2])
+    st.markdown("### Recherche de documents par mots clés")
+    col1, col2 = st.columns([0.4, 0.2])
     search_text = col1.text_input(
         label="Mots clés",
-        placeholder="Separer chaque expression par une virgule",
+        placeholder="Séparer chaque expression par une virgule",
    )
 
     search_distance = col2.slider(
         "Distance entre mots clés",
-        min_value= 100,
-        value = 600,
-        max_value = 1200,
+        min_value=100,
+        value=600,
+        max_value=1200,
         step=20
     )
 
```
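
`find_qualifying_groups` is a sliding-window co-occurrence search: every keyword's match positions are merged into one position-sorted list, the window is shrunk whenever its span reaches `distance_threshold`, and a window qualifies once it holds at least one match for every keyword; each hit is then padded by 100 characters on both sides and the snippets are joined with a dashed separator. A standalone sketch of the same idea, returning position groups instead of snippets; the match-collection line uses `re.finditer`, which is an assumption, since that part of the file sits outside the diff context:

```python
import re
from collections import deque, defaultdict

def find_groups(text, word_list, distance_threshold):
    # Collect (keyword_index, position) pairs for every keyword, then sort by
    # position so a single left-to-right pass sees all matches in text order.
    merged = []
    for i, word in enumerate(word_list):
        # Assumption: occurrences are located roughly like this in the app.
        merged.extend((i, m.start()) for m in re.finditer(re.escape(word), text))
    merged.sort(key=lambda x: x[1])

    groups = []
    window = deque()
    counts = defaultdict(int)  # keyword index -> matches inside the window
    for word_id, pos in merged:
        window.append((word_id, pos))
        counts[word_id] += 1
        # Shrink the window until its character span is below the threshold.
        while window and window[-1][1] - window[0][1] >= distance_threshold:
            old_id, _ = window.popleft()
            counts[old_id] -= 1
            if counts[old_id] == 0:
                del counts[old_id]
        # The window qualifies when every keyword occurs at least once in it.
        if len(counts) == len(word_list):
            groups.append([p for _, p in window])
    return groups

text = "le budget a été voté ... la commune a discuté le budget avant le vote"
print(find_groups(text, ["budget", "vote"], 40))  # one group near the end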
 
```diff
@@ -95,15 +98,15 @@ with st.container(border=True):
 
     search_institution = col3.multiselect(
         label="Institution(s)",
-        default = ['Commmune'],
+        default=['Commmune'],
         options=["Commmune"],
     )
 
     search_regions = col4.multiselect(
-        label="Region(s)",
+        label="Région(s)",
         options=['ALL'] + sorted(list(df_cities['region_name'].unique())),
-        default = ['ALL'],
-        placeholder="Region(s)",
+        default=['ALL'],
+        placeholder="Région(s)",
     )
 
     now = datetime.datetime.now()
@@ -116,47 +119,108 @@ with st.container(border=True):
         max_value=now,
         format="DD.MM.YYYY",
     )
-    st.write("###")
-
-    col1, col2, col3 = st.columns([0.5, 0.3, 0.45])
-    go_search = col2.button("Rechercher", type="primary")
 
+    st.write("###")
 
-if go_search and len(search_dates)==2:
+    col1, col2, col3 = st.columns([0.35, 0.3, 0.45])
+    go_search = col2.button("Rechercher", type="primary", use_container_width=True)
 
+# Only run search if button clicked OR results not in session yet
+if (go_search or "df_results" not in st.session_state) and len(search_dates) == 2:
     search_dates = pd.Timestamp(search_dates[0]), pd.Timestamp(search_dates[1])
-
+
     if 'ALL' not in search_regions:
         search_pattern = '|'.join(search_regions)
-        df_subset_docs = df_docs[df_docs["region"].str.contains(search_pattern)]
+        df_subset_docs = df_docs[df_docs["region"].str.contains(search_pattern, na=False)]
     else:
         df_subset_docs = df_docs
-    df_subset_docs = df_subset_docs[df_subset_docs["scan_date"].between(search_dates[0],search_dates[1])]
+
+    df_subset_docs = df_subset_docs[df_subset_docs["scan_date"].between(search_dates[0], search_dates[1])]
 
     df_subset_docs['selected_texts'] = ''
     all_search_expressions = search_text.split(',')
     valid_search_expressions = []
+
     for word in all_search_expressions:
         word = format_text(word)
         if word != '':
             valid_search_expressions.append(word)
-            df_subset_docs = df_subset_docs[df_subset_docs['text_content'].str.contains(word)] #will remove Nan values
-
-    df_subset_docs['selected_texts'] = df_subset_docs['text_content'].apply(lambda x : find_qualifying_groups(text = x,
-                                                                            word_list = valid_search_expressions,
-                                                                            distance_threshold = search_distance))
-    df_subset_docs = df_subset_docs.dropna(subset=['selected_texts'])
-
-    st.dataframe(
-        df_subset_docs[['scan_date', 'city_name','region', 'selected_texts', 'url']],
-        column_config={
-            'city_name' : st.column_config.TextColumn(width="medium"),
-            'region' : st.column_config.TextColumn(width="medium"),
-            "url": st.column_config.LinkColumn(width="large"),
-            "scan_date": st.column_config.DateColumn(disabled=True, width="small"),
-            'selected_texts' : st.column_config.TextColumn(width="large"),
-        }
+            df_subset_docs = df_subset_docs[df_subset_docs['text_content'].str.contains(word, na=False)]
+
+    df_subset_docs['selected_texts'] = df_subset_docs['text_content'].apply(
+        lambda x: find_qualifying_groups(
+            text=x,
+            word_list=valid_search_expressions,
+            distance_threshold=search_distance
+        )
     )
-
-
+    df_subset_docs = df_subset_docs.dropna(subset=['selected_texts'])
+    st.session_state.df_results = df_subset_docs[['id', 'scan_date', 'city_name', 'region', 'selected_texts', 'url']].reset_index(drop=True)
+
+
+
+if 'random_key' not in st.session_state:
+    st.session_state.random_key = random.random()
+
+# Display persisted results
+if "df_results" in st.session_state:
+
+    df_results = st.session_state.df_results
+
+    with st.container(border=True):
+        st.markdown("### Resultats")
+
+        def get_colors(row):
+            status = st.session_state.doc_status.get(row['id'], None)
+            if status == 'favorite':
+                color = ['background-color:#e6ffea'] * 6
+            elif status == 'backlog':
+                color = ['background-color:#ffe5e5'] * 6
+            else:
+                color = ['background-color:#e6f0ff'] * 6
+            return color
+
+
+        event = st.dataframe(
+            df_results.style.apply(get_colors, axis=1),
+            column_config={
+                'id': st.column_config.TextColumn("Id", disabled=True, width="small"),
+                'scan_date': st.column_config.DateColumn("Date Scan", disabled=True, format="DD.MM.YYYY", width="small"),
+                'city_name': st.column_config.TextColumn("Ville", disabled=True, width="small"),
+                'region': st.column_config.TextColumn("Région", disabled=True, width="small"),
+                'selected_texts': st.column_config.TextColumn("Extrait", disabled=True, width="large"),
+                'url': st.column_config.LinkColumn("URL", disabled=True, width="medium"),
+            },
+            use_container_width=True,
+            on_select="rerun",
+            hide_index=True,
+            key=st.session_state.random_key,
+
+            selection_mode="multi-row",
+            #row_style=lambda row: "background-color: lightgreen;" if row["to_look_at"] else "background-color: #ffd6d6;"
+        )
+        #st.session_state.df_results = df_updated
+
+        _, col22, _, col33, _ = st.columns([0.36, 0.2, 0.1, 0.2, 0.4])
+        button_add_doc_backlog = col22.button(':x:', use_container_width=True)
+        button_add_doc_interest = col33.button(':white_check_mark:', use_container_width=True)
+
+        if button_add_doc_backlog:
+            for element in event.selection['rows']:
+                st.session_state.doc_status[df_results.iloc[element]['id']] = 'backlog'
+            st.session_state.random_key = random.random()
+            st.rerun()
+        if button_add_doc_interest:
+            for element in event.selection['rows']:
+                st.session_state.doc_status[df_results.iloc[element]['id']] = 'favorite'
+            st.session_state.random_key = random.random()
+            st.rerun()
+
+        st.text('')
+        _, center_col_save, _ = st.columns([0.3, 0.2, 0.35])
+        save_selection_to_s3 = center_col_save.button("Sauvergarder les labels", type="primary", use_container_width=True)
+        if save_selection_to_s3:
+            df_doc_status = pd.DataFrame(list(st.session_state.doc_status.items()), columns=['id', 'label'])
+            s3.to_parquet(df = df_doc_status, path = PATH_S3_LABELS)
+
 
```
226