Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -627,96 +627,100 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 627 |
return DocumentCheckResult(success=success, issues=incorrect_references)
|
| 628 |
|
| 629 |
@profile_performance
|
| 630 |
-
def document_title_check(self, doc_path
|
| 631 |
"""Check for correct formatting of document titles."""
|
| 632 |
try:
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
except Exception as e:
|
| 635 |
-
self.logger.error(f"Error
|
| 636 |
-
return DocumentCheckResult(
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
# Define formatting rules for different document types
|
| 641 |
-
formatting_rules = {
|
| 642 |
-
"Advisory Circular": {"italics": True, "quotes": False},
|
| 643 |
-
"Airworthiness Criteria": {"italics": False, "quotes": True},
|
| 644 |
-
"Deviation Memo": {"italics": False, "quotes": True},
|
| 645 |
-
"Exemption": {"italics": False, "quotes": True},
|
| 646 |
-
"Federal Register Notice": {"italics": False, "quotes": True},
|
| 647 |
-
"Order": {"italics": False, "quotes": True},
|
| 648 |
-
"Policy Statement": {"italics": False, "quotes": False},
|
| 649 |
-
"Rule": {"italics": False, "quotes": True},
|
| 650 |
-
"Special Condition": {"italics": False, "quotes": True},
|
| 651 |
-
"Technical Standard Order": {"italics": False, "quotes": True},
|
| 652 |
-
"Other": {"italics": False, "quotes": False}
|
| 653 |
-
}
|
| 654 |
-
|
| 655 |
-
if doc_type not in formatting_rules:
|
| 656 |
-
self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
|
| 657 |
-
return DocumentCheckResult(success=True, issues=[])
|
| 658 |
-
|
| 659 |
-
required_format = formatting_rules[doc_type]
|
| 660 |
-
|
| 661 |
-
ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
|
| 662 |
-
|
| 663 |
-
for paragraph in doc.paragraphs:
|
| 664 |
-
text = paragraph.text
|
| 665 |
-
matches = ac_pattern.finditer(text)
|
| 666 |
-
|
| 667 |
-
for match in matches:
|
| 668 |
-
full_match = match.group(0)
|
| 669 |
-
title_text = match.group(2).strip()
|
| 670 |
-
|
| 671 |
-
# Get the position where the title starts
|
| 672 |
-
title_start = match.start(2)
|
| 673 |
-
title_end = match.end(2)
|
| 674 |
-
|
| 675 |
-
# Check for any type of quotation marks, including smart quotes
|
| 676 |
-
title_in_quotes = any(q in title_text for q in ['"', "'", '“', '”', '‘', '’'])
|
| 677 |
-
|
| 678 |
-
# Check the formatting of the title
|
| 679 |
-
title_is_italicized = False
|
| 680 |
-
current_pos = 0
|
| 681 |
-
for run in paragraph.runs:
|
| 682 |
-
run_length = len(run.text)
|
| 683 |
-
run_start = current_pos
|
| 684 |
-
run_end = current_pos + run_length
|
| 685 |
-
if run_start <= title_start < run_end:
|
| 686 |
-
title_is_italicized = run.italic
|
| 687 |
-
break
|
| 688 |
-
current_pos += run_length
|
| 689 |
-
|
| 690 |
-
# Check if formatting matches the required format
|
| 691 |
-
formatting_incorrect = False
|
| 692 |
-
issue_message = []
|
| 693 |
-
|
| 694 |
-
# Check italics requirement
|
| 695 |
-
if required_format["italics"] and not title_is_italicized:
|
| 696 |
-
formatting_incorrect = True
|
| 697 |
-
issue_message.append("should be italicized")
|
| 698 |
-
elif not required_format["italics"] and title_is_italicized:
|
| 699 |
-
formatting_incorrect = True
|
| 700 |
-
issue_message.append("should not be italicized")
|
| 701 |
-
|
| 702 |
-
# Check quotes requirement
|
| 703 |
-
if required_format["quotes"] and not title_in_quotes:
|
| 704 |
-
formatting_incorrect = True
|
| 705 |
-
issue_message.append("should be in quotes")
|
| 706 |
-
elif not required_format["quotes"] and title_in_quotes:
|
| 707 |
-
formatting_incorrect = True
|
| 708 |
-
issue_message.append("should not be in quotes")
|
| 709 |
-
|
| 710 |
-
if formatting_incorrect:
|
| 711 |
-
incorrect_titles.append({
|
| 712 |
-
'text': title_text,
|
| 713 |
-
'issue': ', '.join(issue_message),
|
| 714 |
-
'sentence': text.strip()
|
| 715 |
-
})
|
| 716 |
-
|
| 717 |
-
success = len(incorrect_titles) == 0
|
| 718 |
-
|
| 719 |
-
return DocumentCheckResult(success=success, issues=incorrect_titles)
|
| 720 |
|
| 721 |
@profile_performance
|
| 722 |
def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
|
|
@@ -933,10 +937,17 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 933 |
def process_document(file_obj, doc_type, template_type):
|
| 934 |
"""Process the document and run all checks."""
|
| 935 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
checker = FAADocumentChecker()
|
| 937 |
doc = Document(file_obj)
|
| 938 |
paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
|
| 939 |
|
|
|
|
|
|
|
|
|
|
| 940 |
# Run all checks
|
| 941 |
results = {}
|
| 942 |
results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
|
|
@@ -957,6 +968,7 @@ def process_document(file_obj, doc_type, template_type):
|
|
| 957 |
return format_results_for_gradio(results, doc_type)
|
| 958 |
except Exception as e:
|
| 959 |
print(f"Error in process_document: {str(e)}")
|
|
|
|
| 960 |
return f"An error occurred while processing the document: {str(e)}"
|
| 961 |
|
| 962 |
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
|
|
|
|
| 627 |
return DocumentCheckResult(success=success, issues=incorrect_references)
|
| 628 |
|
| 629 |
@profile_performance
|
| 630 |
+
def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
    """Check for correct formatting of document titles.

    Scans every paragraph of the document for references of the form
    ``AC <number>[-<number>] <title>`` and compares how the title text is
    formatted (italics / quotation marks) against the rule for ``doc_type``.

    Args:
        doc_path: The document to inspect: a path string, the raw bytes of
            the file, or a binary file-like object (anything with ``read``,
            e.g. ``io.BytesIO``).
        doc_type: Document type name used to look up the formatting rule.
            Unknown types are logged and skipped (reported as success).

    Returns:
        DocumentCheckResult: ``success`` is True when no title violates the
        rule. Each issue is a dict with ``'text'`` (the title), ``'issue'``
        (comma-separated description) and ``'sentence'`` (the paragraph
        text). On invalid input or any exception, ``success`` is False and
        ``issues`` holds a single ``{'error': ...}`` dict.
    """
    try:
        # Document() is presumably python-docx's loader, which takes a path or
        # a file-like object; raw bytes therefore have to be wrapped first.
        if isinstance(doc_path, (bytes, bytearray)):
            doc_path = io.BytesIO(doc_path)

        # Accept paths and any readable stream, matching what
        # process_document hands to Document() itself.
        if not (isinstance(doc_path, str) or hasattr(doc_path, "read")):
            return DocumentCheckResult(
                success=False,
                issues=[{'error': 'Invalid document input type'}]
            )

        doc = Document(doc_path)

        # Define formatting rules for different document types
        formatting_rules = {
            "Advisory Circular": {"italics": True, "quotes": False},
            "Airworthiness Criteria": {"italics": False, "quotes": True},
            "Deviation Memo": {"italics": False, "quotes": True},
            "Exemption": {"italics": False, "quotes": True},
            "Federal Register Notice": {"italics": False, "quotes": True},
            "Order": {"italics": False, "quotes": True},
            "Policy Statement": {"italics": False, "quotes": False},
            "Rule": {"italics": False, "quotes": True},
            "Special Condition": {"italics": False, "quotes": True},
            "Technical Standard Order": {"italics": False, "quotes": True},
            "Other": {"italics": False, "quotes": False},
        }

        if doc_type not in formatting_rules:
            self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
            return DocumentCheckResult(success=True, issues=[])

        required_format = formatting_rules[doc_type]
        ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')

        # Straight quotes plus the curly ("smart") variants. The curly ones are
        # written as escapes so an editor or encoding round-trip cannot
        # silently turn them back into straight quotes.
        quote_chars = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

        incorrect_titles = []

        for paragraph in doc.paragraphs:
            text = paragraph.text

            for match in ac_pattern.finditer(text):
                title_text = match.group(2).strip()
                title_start = match.start(2)

                title_in_quotes = any(q in title_text for q in quote_chars)

                # Italic state is taken from the run containing the first
                # character of the title. Offsets assume paragraph.text is the
                # concatenation of the run texts -- TODO confirm for
                # paragraphs containing hyperlinks.
                title_is_italicized = False
                current_pos = 0
                for run in paragraph.runs:
                    run_end = current_pos + len(run.text)
                    if current_pos <= title_start < run_end:
                        # run.italic may be None (inherited); treat as not italic.
                        title_is_italicized = bool(run.italic)
                        break
                    current_pos = run_end

                issue_message = []

                # Check italics requirement
                if required_format["italics"] and not title_is_italicized:
                    issue_message.append("should be italicized")
                elif not required_format["italics"] and title_is_italicized:
                    issue_message.append("should not be italicized")

                # Check quotes requirement
                if required_format["quotes"] and not title_in_quotes:
                    issue_message.append("should be in quotes")
                elif not required_format["quotes"] and title_in_quotes:
                    issue_message.append("should not be in quotes")

                if issue_message:
                    incorrect_titles.append({
                        'text': title_text,
                        'issue': ', '.join(issue_message),
                        'sentence': text.strip()
                    })

        return DocumentCheckResult(
            success=len(incorrect_titles) == 0,
            issues=incorrect_titles
        )

    except Exception as e:
        # Boundary handler: report the failure as a check result instead of
        # raising, but keep the traceback in the log for diagnosis.
        self.logger.error(f"Error in document_title_check: {e}", exc_info=True)
        return DocumentCheckResult(
            success=False,
            issues=[{'error': str(e)}]
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
|
| 725 |
@profile_performance
|
| 726 |
def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
|
| 937 |
def process_document(file_obj, doc_type, template_type):
|
| 938 |
"""Process the document and run all checks."""
|
| 939 |
try:
|
| 940 |
+
# Convert file object to BytesIO
|
| 941 |
+
if isinstance(file_obj, bytes):
|
| 942 |
+
file_obj = io.BytesIO(file_obj)
|
| 943 |
+
|
| 944 |
checker = FAADocumentChecker()
|
| 945 |
doc = Document(file_obj)
|
| 946 |
paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
|
| 947 |
|
| 948 |
+
# Rewind the file object for additional processing
|
| 949 |
+
file_obj.seek(0)
|
| 950 |
+
|
| 951 |
# Run all checks
|
| 952 |
results = {}
|
| 953 |
results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
|
|
|
|
| 968 |
return format_results_for_gradio(results, doc_type)
|
| 969 |
except Exception as e:
|
| 970 |
print(f"Error in process_document: {str(e)}")
|
| 971 |
+
traceback.print_exc() # This will print the full traceback
|
| 972 |
return f"An error occurred while processing the document: {str(e)}"
|
| 973 |
|
| 974 |
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
|