Spaces:

chris32
/

Text-Intelligence-Real-State

Sleeping

App Files Files Community

Christopher Román Jaimes committed on May 28, 2024

Commit

ac89c56

1 Parent(s): e2740ad

feat: add extraction of a valid left-dotted number string.

Browse files

Files changed (1) hide show

app.py +46 -1

app.py CHANGED Viewed

@@ -33,14 +33,19 @@ def format_gliner_predictions(prediction):
                           .sort_values("score", ascending = False)\
                           .drop_duplicates(subset = "label", keep = "first")
         # Add Columns Label for Text and Probability
         prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
         prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
         # Format Predictions
         entities = prediction_df.set_index("label_text")["text"].to_dict()
         entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
-        predictions_formatted = {**entities, **entities_probs}
         return predictions_formatted
     else:
@@ -166,6 +171,41 @@ def extract_remodeling_year_from_string(string):
     return None
 # Cleaning
 clean_functions_dict = {
     "SUPERFICIE_TERRAZA": extract_surface_from_string,
@@ -249,6 +289,11 @@ def generate_answer(text):
     # Format Prediction Entities
     entities_formatted = format_gliner_predictions(entities)
     # Clean Entities
     entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
     entities_cleaned = dict()

                           .sort_values("score", ascending = False)\
                           .drop_duplicates(subset = "label", keep = "first")
+        # Add Position Column
+        prediction_df["position"] = prediction_df.apply(lambda x: (x["start"], x["end"]) ,axis = 1)
         # Add Columns Label for Text and Probability
         prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
         prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
+        prediction_df["label_position"] = prediction_df["label"].apply(lambda x: f"pos_{x}")
         # Format Predictions
         entities = prediction_df.set_index("label_text")["text"].to_dict()
         entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
+        entities_positions = prediction_df.set_index("label_position")["position"].to_dict()
+        predictions_formatted = {**entities, **entities_probs, **entities_positions}
         return predictions_formatted
     else:
     return None
def extract_valid_string_left_dotted(string, text, pos):
    """Re-attach a dotted digit group that sits immediately left of an entity.

    When the entity text starts with a digit and is directly preceded in
    ``text`` by ``<1-3 digits>.`` (for example the entity ``"200 m2"`` found
    inside ``"... 1.200 m2"``), the entity is widened to include that leading
    group.

    Args:
        string: Entity text to validate. Anything that is not a ``str``
            yields ``None``.
        text: Full text the entity was taken from.
        pos: ``(start, end)`` pair for the entity; used here as slice
            offsets into ``text``.

    Returns:
        * ``None`` if ``string`` is not a ``str``, if the entity starts
          within the first five characters of ``text``, or if a
          ``digit + "."`` prefix exists but the five-character look-back
          window does not match the expected shape (e.g. more than three
          digits, or a window starting with a digit, ``.`` or ``,``).
        * ``string`` unchanged if it does not start with a digit (including
          the empty string) or is not preceded by ``digit + "."``.
        * Otherwise the matched slice of the look-back window. Note that the
          match is anchored at the start of the window, so it also contains
          any non-digit characters that precede the digit group
          (``"de 1.200 m2"`` rather than ``"1.200 m2"``).
    """
    if not isinstance(string, str):
        return None

    left_pos, right_pos = pos

    # The checks below look up to five characters to the left of the entity;
    # entities closer than that to the start of the text are discarded.
    if left_pos < 5:
        return None

    # Slice instead of indexing so an empty entity text does not raise
    # IndexError; it is handled like any other non-numeric string.
    if not string[:1].isdigit():
        return string

    # Only a "<digit>." immediately to the left means the entity may be the
    # tail of a longer dotted number.
    if text[left_pos - 1] != "." or not text[left_pos - 2].isdigit():
        return string

    # Window: five characters of left context followed by the entity itself.
    sub_text = text[left_pos - 5:right_pos]
    # The window must not start with a digit, '.' or ',', then may contain
    # non-digits, then 1-3 digits, a dot, and the entity text verbatim.
    pattern = r'^(?![\d.,])\D*\d{1,3}\.' + re.escape(string)
    match = re.search(pattern, sub_text)
    return match.group(0) if match else None
 # Cleaning
 clean_functions_dict = {
     "SUPERFICIE_TERRAZA": extract_surface_from_string,
     # Format Prediction Entities
     entities_formatted = format_gliner_predictions(entities)
+    # Extract valid string left dotted
+    feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
+    for feature_name in feature_surfaces:
+        entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"])
     # Clean Entities
     entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
     entities_cleaned = dict()