Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -288,6 +288,28 @@ button:hover {
|
|
| 288 |
border-radius: 10px;
|
| 289 |
background-color: rgba(255,255,255,0.3);
|
| 290 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
/* Hide elements */
|
| 292 |
footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
|
| 293 |
#sentiment-analysis, #risk-visualization {
|
|
@@ -314,6 +336,13 @@ footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsett
|
|
| 314 |
.dark .count-item:hover {
|
| 315 |
background-color: rgba(255,255,255,0.05);
|
| 316 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
"""
|
| 318 |
|
| 319 |
# Salesforce credentials
|
|
@@ -551,18 +580,59 @@ def extract_text_from_pdf(pdf_path: str) -> str:
|
|
| 551 |
for page in pdf.pages:
|
| 552 |
page_text = page.extract_text()
|
| 553 |
if page_text:
|
| 554 |
-
text += page_text
|
| 555 |
return text
|
| 556 |
except Exception as e:
|
| 557 |
logger.error(f"PDF text extraction failed: {str(e)}")
|
| 558 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
| 559 |
|
| 560 |
-
def
|
| 561 |
-
"""
|
| 562 |
-
|
|
|
|
|
|
|
| 563 |
for keyword in keywords:
|
| 564 |
-
|
| 565 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
def find_penalty_values(text: str) -> List[float]:
|
| 568 |
"""Find penalty amounts in the text"""
|
|
@@ -692,6 +762,23 @@ def format_clause_example(example: str, index: int) -> str:
|
|
| 692 |
</div>
|
| 693 |
"""
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
def analyze_pdf(file_obj) -> List:
|
| 696 |
"""Main analysis function for Gradio interface"""
|
| 697 |
try:
|
|
@@ -707,8 +794,6 @@ def analyze_pdf(file_obj) -> List:
|
|
| 707 |
text = extract_text_from_pdf(file_obj.name)
|
| 708 |
if not text.strip():
|
| 709 |
raise Exception("No text extracted from PDF. It might be a scanned document.")
|
| 710 |
-
# Split text into lines for line number tracking
|
| 711 |
-
lines = text.split('\n')
|
| 712 |
except Exception as e:
|
| 713 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
| 714 |
|
|
@@ -722,15 +807,16 @@ def analyze_pdf(file_obj) -> List:
|
|
| 722 |
obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
|
| 723 |
delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
|
| 724 |
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
|
|
|
| 728 |
|
| 729 |
-
|
|
|
|
|
|
|
| 730 |
|
| 731 |
-
|
| 732 |
-
total_obligations = sum(obligation_counts.values())
|
| 733 |
-
total_delays = sum(delay_counts.values())
|
| 734 |
|
| 735 |
# Generate warning messages with emojis
|
| 736 |
penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
|
|
@@ -751,49 +837,60 @@ def analyze_pdf(file_obj) -> List:
|
|
| 751 |
except Exception as e:
|
| 752 |
raise Exception(f"Visual generation failed: {str(e)}")
|
| 753 |
|
| 754 |
-
#
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
|
|
|
| 765 |
{penalty_warning}
|
| 766 |
<div class='penalty-box'>
|
| 767 |
<div class='section-title'>💰 Penalty Clause Details</div>
|
| 768 |
-
{"".join(
|
| 769 |
-
</div>
|
| 770 |
-
<div class='penalty-box'>
|
| 771 |
-
<div class='section-title'>💰 Detailed Penalty Line References</div>
|
| 772 |
-
{"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--danger-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in penalty_line_refs.items()]) or '<div class="success-box">✅ No penalty keywords found in specific lines.</div>'}
|
| 773 |
</div>
|
| 774 |
"""
|
| 775 |
|
| 776 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
{obligation_warning}
|
| 778 |
<div class='obligation-box'>
|
| 779 |
<div class='section-title'>📝 Obligation Clause Details</div>
|
| 780 |
-
{"".join(
|
| 781 |
-
</div>
|
| 782 |
-
<div class='obligation-box'>
|
| 783 |
-
<div class='section-title'>📝 Detailed Obligation Line References</div>
|
| 784 |
-
{"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--warning-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in obligation_line_refs.items()]) or '<div class="success-box">✅ No obligation keywords found in specific lines.</div>'}
|
| 785 |
</div>
|
| 786 |
"""
|
| 787 |
|
| 788 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 789 |
{delay_warning}
|
| 790 |
<div class='delay-box'>
|
| 791 |
<div class='section-title'>⏱ Delay Clause Details</div>
|
| 792 |
-
{"".join(
|
| 793 |
-
</div>
|
| 794 |
-
<div class='delay-box'>
|
| 795 |
-
<div class='section-title'>⏱ Detailed Delay Line References</div>
|
| 796 |
-
{"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--info-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in delay_line_refs.items()]) or '<div class="success-box">✅ No delay keywords found in specific lines.</div>'}
|
| 797 |
</div>
|
| 798 |
"""
|
| 799 |
|
|
@@ -813,10 +910,10 @@ def analyze_pdf(file_obj) -> List:
|
|
| 813 |
'risk_level': risk_level,
|
| 814 |
'record_id': record_id,
|
| 815 |
'penalty_examples': extracted_data,
|
| 816 |
-
'penalty_details': "\n".join([f"{kw}: {count}" for kw,
|
| 817 |
'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
|
| 818 |
-
'obligation_details': "\n".join([f"{kw}: {count}" for kw,
|
| 819 |
-
'delay_details': "\n".join([f"{kw}: {count}" for kw,
|
| 820 |
}
|
| 821 |
|
| 822 |
try:
|
|
@@ -896,10 +993,10 @@ def analyze_pdf(file_obj) -> List:
|
|
| 896 |
</div>
|
| 897 |
""",
|
| 898 |
"", # Empty string for hidden risk visualization
|
| 899 |
-
|
| 900 |
f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
|
| 901 |
-
|
| 902 |
-
|
| 903 |
f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
|
| 904 |
sentiment_analysis_output,
|
| 905 |
temp_file_path # Return temporary file path for PDF download
|
|
|
|
| 288 |
border-radius: 10px;
|
| 289 |
background-color: rgba(255,255,255,0.3);
|
| 290 |
}
|
| 291 |
+
.keyword-match {
|
| 292 |
+
background-color: rgba(255, 255, 0, 0.3);
|
| 293 |
+
padding: 2px 4px;
|
| 294 |
+
border-radius: 3px;
|
| 295 |
+
font-weight: bold;
|
| 296 |
+
}
|
| 297 |
+
.match-detail {
|
| 298 |
+
margin-top: 5px;
|
| 299 |
+
padding: 8px;
|
| 300 |
+
background-color: rgba(0,0,0,0.05);
|
| 301 |
+
border-radius: 5px;
|
| 302 |
+
font-size: 14px;
|
| 303 |
+
}
|
| 304 |
+
.match-line {
|
| 305 |
+
font-family: monospace;
|
| 306 |
+
white-space: pre-wrap;
|
| 307 |
+
margin-bottom: 5px;
|
| 308 |
+
}
|
| 309 |
+
.match-context {
|
| 310 |
+
font-style: italic;
|
| 311 |
+
color: var(--secondary-color);
|
| 312 |
+
}
|
| 313 |
/* Hide elements */
|
| 314 |
footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
|
| 315 |
#sentiment-analysis, #risk-visualization {
|
|
|
|
| 336 |
.dark .count-item:hover {
|
| 337 |
background-color: rgba(255,255,255,0.05);
|
| 338 |
}
|
| 339 |
+
.dark .keyword-match {
|
| 340 |
+
background-color: rgba(255, 255, 0, 0.5);
|
| 341 |
+
color: black;
|
| 342 |
+
}
|
| 343 |
+
.dark .match-detail {
|
| 344 |
+
background-color: rgba(255,255,255,0.05);
|
| 345 |
+
}
|
| 346 |
"""
|
| 347 |
|
| 348 |
# Salesforce credentials
|
|
|
|
| 580 |
for page in pdf.pages:
|
| 581 |
page_text = page.extract_text()
|
| 582 |
if page_text:
|
| 583 |
+
text += page_text + "\n" # Add newline between pages
|
| 584 |
return text
|
| 585 |
except Exception as e:
|
| 586 |
logger.error(f"PDF text extraction failed: {str(e)}")
|
| 587 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
| 588 |
|
| 589 |
+
def find_keyword_matches(text: str, keywords: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Locate every whole-word, case-insensitive hit of each keyword in ``text``.

    The text is split on newline characters and scanned one line at a time, so
    a multi-word keyword is only found when it sits entirely on a single line.

    Args:
        text: Document text to search.
        keywords: Literal phrases to look for (regex metacharacters are escaped).

    Returns:
        A dict keyed by keyword. Each value is a list, in document order, of
        dicts with:
            ``line_number`` - 1-based line index (an ``int``),
            ``full_line``   - the stripped, *unescaped* source line,
            ``context``     - an HTML snippet: up to 20 characters either side
                              of the hit, HTML-escaped, with the hit wrapped in
                              ``<span class='keyword-match'>``,
            ``match``       - the matched text exactly as it appears in the line.
        Keywords with no hits map to an empty list.
    """
    # Imported locally so the function does not rely on the module's import block.
    from html import escape

    window = 20  # characters of surrounding text kept on each side of a hit
    lines = text.split('\n')
    matches = {}

    for keyword in keywords:
        # Compiled once per keyword, outside the per-line loop.
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags=re.IGNORECASE)
        keyword_matches = []

        for line_num, line in enumerate(lines, 1):
            for match in pattern.finditer(line):
                start = max(0, match.start() - window)
                end = min(len(line), match.end() + window)

                # The document text is untrusted: escape every piece before it
                # is combined with our own highlight markup, otherwise markup
                # in the source text would be rendered by the HTML output.
                highlighted_context = (
                    escape(line[start:match.start()])
                    + f"<span class='keyword-match'>{escape(match.group())}</span>"
                    + escape(line[match.end():end])
                )

                keyword_matches.append({
                    'line_number': line_num,
                    'full_line': line.strip(),
                    'context': highlighted_context,
                    'match': match.group(),
                })

        matches[keyword] = keyword_matches

    return matches
|
| 622 |
+
|
| 623 |
+
def count_keywords_with_details(text: str, keywords: List[str]) -> Dict[str, Dict]:
    """Summarise keyword hits as a count plus the underlying match records.

    Args:
        text: Document text to search.
        keywords: Phrases to look up; passed unchanged to ``find_keyword_matches``.

    Returns:
        A dict keyed by keyword, each value being
        ``{'count': <number of matches>, 'matches': <list of match records>}``.
        A keyword absent from the lookup result gets a count of 0 and an
        empty list.
    """
    hits_by_term = find_keyword_matches(text, keywords)
    return {
        term: {
            'count': len(hits_by_term.get(term, [])),
            'matches': hits_by_term.get(term, []),
        }
        for term in keywords
    }
|
| 636 |
|
| 637 |
def find_penalty_values(text: str) -> List[float]:
|
| 638 |
"""Find penalty amounts in the text"""
|
|
|
|
| 762 |
</div>
|
| 763 |
"""
|
| 764 |
|
| 765 |
+
def format_keyword_matches(matches: List[Dict[str, str]], *, max_matches: int = 5) -> str:
    """Render keyword match records as HTML detail cards.

    Args:
        matches: Match records, each providing ``line_number``, ``context``
            and ``full_line``.
        max_matches: Maximum number of records rendered (the first ones in the
            list are kept). Defaults to 5.

    Returns:
        One ``match-detail`` ``<div>`` per rendered record, concatenated, or a
        single ``success-box`` ``<div>`` when ``matches`` is empty.
    """
    # Imported locally so the function does not rely on the module's import block.
    from html import escape

    if not matches:
        return "<div class='success-box'>✅ No matches found for this keyword</div>"

    cards = []
    for position, record in enumerate(matches[:max_matches], 1):
        # `context` is inserted verbatim because it carries the highlight
        # <span> markup; escaping of its surrounding text has to happen where
        # that markup is built. `full_line` is plain document text, so it is
        # escaped here to stop markup in the source from being rendered.
        cards.append(f"""
        <div class="match-detail">
            <div><strong>Match {position}:</strong> Line {record['line_number']}</div>
            <div class="match-context">Context: {record['context']}</div>
            <div class="match-line">Full line: {escape(str(record['full_line']))}</div>
        </div>
        """)

    return "".join(cards)
|
| 781 |
+
|
| 782 |
def analyze_pdf(file_obj) -> List:
|
| 783 |
"""Main analysis function for Gradio interface"""
|
| 784 |
try:
|
|
|
|
| 794 |
text = extract_text_from_pdf(file_obj.name)
|
| 795 |
if not text.strip():
|
| 796 |
raise Exception("No text extracted from PDF. It might be a scanned document.")
|
|
|
|
|
|
|
| 797 |
except Exception as e:
|
| 798 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
| 799 |
|
|
|
|
| 807 |
obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
|
| 808 |
delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
|
| 809 |
|
| 810 |
+
# Get detailed keyword matches with line numbers and context
|
| 811 |
+
penalty_details = count_keywords_with_details(text, penalty_keywords)
|
| 812 |
+
obligation_details = count_keywords_with_details(text, obligation_keywords)
|
| 813 |
+
delay_details = count_keywords_with_details(text, delay_keywords)
|
| 814 |
|
| 815 |
+
total_penalties = sum(details['count'] for details in penalty_details.values())
|
| 816 |
+
total_obligations = sum(details['count'] for details in obligation_details.values())
|
| 817 |
+
total_delays = sum(details['count'] for details in delay_details.values())
|
| 818 |
|
| 819 |
+
penalty_values = find_penalty_values(text)
|
|
|
|
|
|
|
| 820 |
|
| 821 |
# Generate warning messages with emojis
|
| 822 |
penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
|
|
|
|
| 837 |
except Exception as e:
|
| 838 |
raise Exception(f"Visual generation failed: {str(e)}")
|
| 839 |
|
| 840 |
+
# Format penalty details with match information
|
| 841 |
+
penalty_html = []
|
| 842 |
+
for keyword, details in penalty_details.items():
|
| 843 |
+
penalty_html.append(f"""
|
| 844 |
+
<div class='count-item'>
|
| 845 |
+
<span class='count-label'><span style='color: var(--danger-color)'>•</span> {keyword}</span>
|
| 846 |
+
<span class='count-value'>{details['count']}</span>
|
| 847 |
+
</div>
|
| 848 |
+
{format_keyword_matches(details['matches'])}
|
| 849 |
+
""")
|
| 850 |
+
|
| 851 |
+
penalty_details_html = f"""
|
| 852 |
{penalty_warning}
|
| 853 |
<div class='penalty-box'>
|
| 854 |
<div class='section-title'>💰 Penalty Clause Details</div>
|
| 855 |
+
{"".join(penalty_html)}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 856 |
</div>
|
| 857 |
"""
|
| 858 |
|
| 859 |
+
# Format obligation details with match information
|
| 860 |
+
obligation_html = []
|
| 861 |
+
for keyword, details in obligation_details.items():
|
| 862 |
+
obligation_html.append(f"""
|
| 863 |
+
<div class='count-item'>
|
| 864 |
+
<span class='count-label'><span style='color: var(--warning-color)'>•</span> {keyword}</span>
|
| 865 |
+
<span class='count-value'>{details['count']}</span>
|
| 866 |
+
</div>
|
| 867 |
+
{format_keyword_matches(details['matches'])}
|
| 868 |
+
""")
|
| 869 |
+
|
| 870 |
+
obligation_details_html = f"""
|
| 871 |
{obligation_warning}
|
| 872 |
<div class='obligation-box'>
|
| 873 |
<div class='section-title'>📝 Obligation Clause Details</div>
|
| 874 |
+
{"".join(obligation_html)}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
</div>
|
| 876 |
"""
|
| 877 |
|
| 878 |
+
# Format delay details with match information
|
| 879 |
+
delay_html = []
|
| 880 |
+
for keyword, details in delay_details.items():
|
| 881 |
+
delay_html.append(f"""
|
| 882 |
+
<div class='count-item'>
|
| 883 |
+
<span class='count-label'><span style='color: var(--info-color)'>•</span> {keyword}</span>
|
| 884 |
+
<span class='count-value'>{details['count']}</span>
|
| 885 |
+
</div>
|
| 886 |
+
{format_keyword_matches(details['matches'])}
|
| 887 |
+
""")
|
| 888 |
+
|
| 889 |
+
delay_details_html = f"""
|
| 890 |
{delay_warning}
|
| 891 |
<div class='delay-box'>
|
| 892 |
<div class='section-title'>⏱ Delay Clause Details</div>
|
| 893 |
+
{"".join(delay_html)}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
</div>
|
| 895 |
"""
|
| 896 |
|
|
|
|
| 910 |
'risk_level': risk_level,
|
| 911 |
'record_id': record_id,
|
| 912 |
'penalty_examples': extracted_data,
|
| 913 |
+
'penalty_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in penalty_details.items()]),
|
| 914 |
'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
|
| 915 |
+
'obligation_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in obligation_details.items()]),
|
| 916 |
+
'delay_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in delay_details.items()])
|
| 917 |
}
|
| 918 |
|
| 919 |
try:
|
|
|
|
| 993 |
</div>
|
| 994 |
""",
|
| 995 |
"", # Empty string for hidden risk visualization
|
| 996 |
+
penalty_details_html,
|
| 997 |
f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
|
| 998 |
+
obligation_details_html,
|
| 999 |
+
delay_details_html,
|
| 1000 |
f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
|
| 1001 |
sentiment_analysis_output,
|
| 1002 |
temp_file_path # Return temporary file path for PDF download
|