Update app.py
Browse files
app.py
CHANGED
|
@@ -45,22 +45,38 @@ def extract_text(image):
|
|
| 45 |
extracted_text.append(line[1][0])
|
| 46 |
return "\n".join(extracted_text)
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# Function to extract attributes using regex
|
| 49 |
def extract_attributes(extracted_text):
|
| 50 |
attributes = {}
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# Patterns for extracting personal information
|
| 53 |
patterns = {
|
| 54 |
-
"Name": r"Name[:\-]?\s*([A-Za-z\s]
|
| 55 |
"Age": r"Age[:\-]?\s*(\d{1,3})",
|
| 56 |
"Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
|
| 57 |
"Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
|
| 58 |
}
|
| 59 |
|
| 60 |
for readable_attr, pattern in patterns.items():
|
| 61 |
-
match = re.search(pattern,
|
| 62 |
if match:
|
| 63 |
attributes[readable_attr] = match.group(1).strip()
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
if "Gender" in attributes:
|
| 66 |
attributes["Gender"] = GENDER_MAPPING.get(attributes["Gender"], attributes["Gender"])
|
|
|
|
| 45 |
extracted_text.append(line[1][0])
|
| 46 |
return "\n".join(extracted_text)
|
| 47 |
|
| 48 |
+
# Function to clean extracted text
def clean_extracted_text(text):
    """Flatten OCR output into a single, single-spaced line.

    Every run of whitespace (spaces, tabs, and any style of line break:
    LF, CRLF or a bare CR) is replaced by exactly one space, and leading
    and trailing whitespace is removed. No line breaks survive in the
    result.

    Args:
        text: Raw text to clean.

    Returns:
        The flattened string; empty if ``text`` is empty or whitespace only.
    """
    # The regex whitespace class already matches carriage returns and
    # newlines, so a separate newline-normalization pass would be redundant.
    return re.sub(r'\s+', ' ', text.strip())
|
| 55 |
+
|
| 56 |
# Function to extract attributes using regex
|
| 57 |
def extract_attributes(extracted_text):
|
| 58 |
attributes = {}
|
| 59 |
|
| 60 |
+
# Clean the extracted text
|
| 61 |
+
cleaned_text = clean_extracted_text(extracted_text)
|
| 62 |
+
print(f"Raw extracted text: '{extracted_text}'")
|
| 63 |
+
print(f"Cleaned extracted text: '{cleaned_text}'")
|
| 64 |
+
|
| 65 |
# Patterns for extracting personal information
|
| 66 |
patterns = {
|
| 67 |
+
"Name": r"Name[:\-]?\s*([A-Za-z\s]+)(?=\s*(?:Age|Gender|Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
|
| 68 |
"Age": r"Age[:\-]?\s*(\d{1,3})",
|
| 69 |
"Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
|
| 70 |
"Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
|
| 71 |
}
|
| 72 |
|
| 73 |
for readable_attr, pattern in patterns.items():
|
| 74 |
+
match = re.search(pattern, cleaned_text, re.IGNORECASE)
|
| 75 |
if match:
|
| 76 |
attributes[readable_attr] = match.group(1).strip()
|
| 77 |
+
print(f"Extracted {readable_attr}: '{attributes[readable_attr]}'")
|
| 78 |
+
else:
|
| 79 |
+
print(f"No match for {readable_attr} with pattern: {pattern}")
|
| 80 |
|
| 81 |
if "Gender" in attributes:
|
| 82 |
attributes["Gender"] = GENDER_MAPPING.get(attributes["Gender"], attributes["Gender"])
|