Christopher Román Jaimes
committed on
Commit
·
ac89c56
1
Parent(s):
e2740ad
feat: add extraction of a valid number string that has a dotted prefix on its left.
Browse files
app.py
CHANGED
|
@@ -33,14 +33,19 @@ def format_gliner_predictions(prediction):
|
|
| 33 |
.sort_values("score", ascending = False)\
|
| 34 |
.drop_duplicates(subset = "label", keep = "first")
|
| 35 |
|
|
|
|
|
|
|
|
|
|
| 36 |
# Add Columns Label for Text and Probability
|
| 37 |
prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
|
| 38 |
prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
|
|
|
|
| 39 |
|
| 40 |
# Format Predictions
|
| 41 |
entities = prediction_df.set_index("label_text")["text"].to_dict()
|
| 42 |
entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
|
| 43 |
-
|
|
|
|
| 44 |
|
| 45 |
return predictions_formatted
|
| 46 |
else:
|
|
@@ -166,6 +171,41 @@ def extract_remodeling_year_from_string(string):
|
|
| 166 |
|
| 167 |
return None
|
| 168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
# Cleaning
|
| 170 |
clean_functions_dict = {
|
| 171 |
"SUPERFICIE_TERRAZA": extract_surface_from_string,
|
|
@@ -249,6 +289,11 @@ def generate_answer(text):
|
|
| 249 |
# Format Prediction Entities
|
| 250 |
entities_formatted = format_gliner_predictions(entities)
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
# Clean Entities
|
| 253 |
entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
|
| 254 |
entities_cleaned = dict()
|
|
|
|
| 33 |
.sort_values("score", ascending = False)\
|
| 34 |
.drop_duplicates(subset = "label", keep = "first")
|
| 35 |
|
| 36 |
+
# Add Position Column
|
| 37 |
+
prediction_df["position"] = prediction_df.apply(lambda x: (x["start"], x["end"]) ,axis = 1)
|
| 38 |
+
|
| 39 |
# Add Columns Label for Text and Probability
|
| 40 |
prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
|
| 41 |
prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
|
| 42 |
+
prediction_df["label_position"] = prediction_df["label"].apply(lambda x: f"pos_{x}")
|
| 43 |
|
| 44 |
# Format Predictions
|
| 45 |
entities = prediction_df.set_index("label_text")["text"].to_dict()
|
| 46 |
entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
|
| 47 |
+
entities_positions = prediction_df.set_index("label_position")["position"].to_dict()
|
| 48 |
+
predictions_formatted = {**entities, **entities_probs, **entities_positions}
|
| 49 |
|
| 50 |
return predictions_formatted
|
| 51 |
else:
|
|
|
|
| 171 |
|
| 172 |
return None
|
| 173 |
|
| 174 |
+
def extract_valid_string_left_dotted(string, text, pos):
    """Re-attach a dotted digit group found immediately left of ``string``.

    ``string`` is expected to be located at ``text[pos[0]:pos[1]]``. When it
    starts with a digit and is directly preceded in ``text`` by ``<digit>.``,
    the 5 characters to its left are inspected to decide whether a 1-3 digit
    group followed by a dot belongs in front of it (e.g. ``"200"`` found in
    ``"de 1.200"`` becomes ``"1.200"``).

    Args:
        string: The extracted value. Anything that is not a ``str`` yields
            ``None``.
        text: The full text ``string`` was extracted from.
        pos: A ``(start, end)`` pair with the position of ``string`` in
            ``text``.

    Returns:
        * ``None`` if ``string`` is not a ``str``.
        * ``None`` if ``string`` starts fewer than 5 characters into ``text``.
        * ``string`` unchanged if it is empty, does not start with a digit,
          has no dot directly to its left, or that dot is not preceded by a
          digit.
        * The digit group, the dot and ``string`` joined together when the
          left context is a 1-3 digit group that is not itself preceded by a
          digit, ``.`` or ``,`` within the 5-character window.
        * ``None`` when the left dot is preceded by a digit but the context
          does not have that shape (for instance a longer dotted number).
    """
    if not isinstance(string, str):
        return None

    left_pos, right_pos = pos

    # The checks below need 5 characters of context to the left of the
    # string; strings that start closer to the beginning are discarded.
    if left_pos < 5:
        return None

    # An empty string has no first character to inspect (``string[0]`` would
    # raise IndexError), so it is handed back like any non-numeric string.
    if not string or not string[0].isdigit():
        return string

    # Without a "<digit>." directly to the left there is nothing to
    # re-attach, so the original string is already complete.
    if text[left_pos - 1] != "." or not text[left_pos - 2].isdigit():
        return string

    # Window made of the 5 characters to the left plus the string itself.
    sub_text = text[left_pos - 5:right_pos]

    # The window must start with a character that is not a digit, "." or ",",
    # followed by optional non-digits and then a 1-3 digit group, the dot and
    # the string. Only the numeric part is captured so that the surrounding
    # non-digit context is not returned as part of the value.
    pattern = r'^(?![\d.,])\D*(\d{1,3}\.' + re.escape(string) + r')'
    match = re.search(pattern, sub_text)
    if match is None:
        return None
    return match.group(1)
|
| 208 |
+
|
| 209 |
# Cleaning
|
| 210 |
clean_functions_dict = {
|
| 211 |
"SUPERFICIE_TERRAZA": extract_surface_from_string,
|
|
|
|
| 289 |
# Format Prediction Entities
|
| 290 |
entities_formatted = format_gliner_predictions(entities)
|
| 291 |
|
| 292 |
+
# Extract valid string left dotted
|
| 293 |
+
feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
|
| 294 |
+
for feature_name in feature_surfaces:
|
| 295 |
+
entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"])
|
| 296 |
+
|
| 297 |
# Clean Entities
|
| 298 |
entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
|
| 299 |
entities_cleaned = dict()
|