Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 12, 2025

Commit

054584c

verified ·

1 Parent(s): bb88c28

Update app.py

Browse files

Files changed (1) hide show

app.py +208 -81

app.py CHANGED Viewed

@@ -101,6 +101,69 @@ def process_excel(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
 def preprocess_text(text):
     """Preprocess text to add appropriate formatting before summarization"""
     if not isinstance(text, str) or not text.strip():
@@ -109,8 +172,8 @@ def preprocess_text(text):
     # Split text into sentences (basic implementation)
     sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n')]
-    # Remove empty sentences
-    sentences = [s for s in sentences if s]
     # Join with proper line breaks
     formatted_text = '\n'.join(sentences)
@@ -118,115 +181,179 @@ def preprocess_text(text):
     return formatted_text
 def post_process_summary(summary):
-    """Clean up and improve summary coherence."""
     if not summary:
         return summary
-    # Split into sentences
-    sentences = [s.strip() for s in summary.split('.')]
-    sentences = [s for s in sentences if s]  # Remove empty sentences
-    # Correct common issues
-    processed_sentences = []
-    for sentence in sentences:
-        # Remove redundant phrases
-        sentence = re.sub(r"\b(and and|appointment and appointment)\b", "and", sentence)
-        # Ensure first letter capitalization
-        sentence = sentence.capitalize()
-        # Avoid duplicates
-        if sentence not in processed_sentences:
-            processed_sentences.append(sentence)
-    # Join sentences with proper punctuation
-    cleaned_summary = '. '.join(processed_sentences)
-    return cleaned_summary if cleaned_summary.endswith('.') else cleaned_summary + '.'
 def improve_summary_generation(text, model, tokenizer):
-    """Generate improved summary with better prompt and validation."""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
-    # Add a structured prompt for summarization
     formatted_text = (
-        "Summarize this biomedical research abstract into the following structure:\n"
-        "1. Background and Objectives\n"
-        "2. Methods\n"
-        "3. Key Findings (include any percentages or numbers)\n"
-        "4. Conclusions\n"
-        f"Abstract:\n{text.strip()}"
     )
-    # Prepare input tokens
     inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    # Generate summary with adjusted parameters
-    try:
         with torch.no_grad():
-            summary_ids = model.generate(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                max_length=300,  # Increased for more detailed summaries
-                min_length=100,  # Ensure summaries are not too short
-                num_beams=5,
-                length_penalty=1.5,
-                no_repeat_ngram_size=3,
-                temperature=0.7,
-                repetition_penalty=1.3,
             )
-        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-    except Exception as e:
-        return f"Error in generation: {str(e)}"
-    # Post-process the summary
-    return post_process_summary(summary)
-    # Validate the summary
-    if not validate_summary(processed_summary, text):
-    # Retry with alternate generation parameters
-        with torch.no_grad():
-            summary_ids = model.generate(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                max_length=250,
-                min_length=50,
-                num_beams=4,
-                length_penalty=2.0,
-                no_repeat_ngram_size=4,
-                temperature=0.8,
-                repetition_penalty=1.5,
-            )
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
         processed_summary = post_process_summary(summary)
-    return processed_summary
 def validate_summary(summary, original_text):
-    """Validate summary content against original text."""
-    # Check for common validation points
-    if not summary or len(summary.split()) < 20:
-        return False  # Too short
-    if len(summary.split()) > len(original_text.split()) * 0.8:
-        return False  # Too long
-    # Ensure structure is maintained (e.g., headings are present)
-    required_sections = ["background and objectives", "methods", "key findings", "conclusions"]
-    if not all(section.lower() in summary.lower() for section in required_sections):
         return False
-    # Ensure no repetitive sentences
     sentences = summary.split('.')
-    if len(sentences) != len(set(sentences)):
         return False
     return True
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
     # Preprocess each abstract

         st.error(f"Error processing file: {str(e)}")
         return None
def verify_facts(summary, original_text):
    """Verify that key facts in the summary match the original text.

    Args:
        summary: Generated summary text.
        original_text: Text the summary was generated from.

    Returns:
        A dict with:
            ``missing_numbers``: numeric strings found in the original but
                not in the summary.
            ``missing_relationships``: context snippets around relationship
                phrases found in the original but not (verbatim) in the
                summary.
            ``contradictions``: messages for opposing claims.
            ``is_valid``: ``True`` when there are no missing numbers and no
                contradictions.
    """
    # Extract numbers and percentages
    def extract_numbers(text):
        # The decimal part must contain digits, so a sentence-final period
        # ("in 2005.") is not glued onto the number.
        return set(re.findall(r'\d+(?:\.\d+)?', text))

    original_numbers = extract_numbers(original_text)
    summary_numbers = extract_numbers(summary)
    # Check if all numbers from original are in summary
    missing_numbers = original_numbers - summary_numbers

    # Extract key phrases indicating relationships
    relationship_patterns = [
        r'associated with',
        r'predicted',
        r'correlated with',
        r'relationship between',
        r'linked to'
    ]

    def extract_relationships(text):
        # NOTE(review): snippets are raw +/-50 character windows, so a summary
        # only "covers" a relationship when its window is identical to the
        # original's. This field does not affect ``is_valid``.
        relationships = set()
        for pattern in relationship_patterns:
            # Match case-insensitively on the text itself so the match
            # offsets always index into the string being sliced.
            for match in re.finditer(pattern, text, re.IGNORECASE):
                # Get surrounding context
                start = max(0, match.start() - 50)
                end = min(len(text), match.end() + 50)
                relationships.add(text[start:end].strip())
        return relationships

    original_relationships = extract_relationships(original_text)
    summary_relationships = extract_relationships(summary)

    # Check for contradictions
    def find_contradictions(summary, original):
        contradictions = []
        # Common contradiction patterns (pairs of opposing terms)
        opposing_terms = [
            ('no association', 'associated with'),
            ('did not predict', 'predicted'),
            ('was not significant', 'was significant'),
            ('decreased', 'increased'),
            ('lower', 'higher')
        ]

        def mentions(term, text):
            # Leading word boundary keeps "slower"/"follower" from counting as "lower".
            return re.search(r'\b' + re.escape(term), text, re.IGNORECASE) is not None

        for first, second in opposing_terms:
            for claimed, opposite in ((first, second), (second, first)):
                # A contradiction needs the summary to assert something the
                # original never says while the original says the opposite.
                # A summary that repeats both terms from the original is fine.
                if (mentions(claimed, summary)
                        and not mentions(claimed, original)
                        and mentions(opposite, original)):
                    contradictions.append(f"Contradiction found: {first} vs {second}")
                    break
        return contradictions

    contradictions = find_contradictions(summary, original_text)

    return {
        'missing_numbers': missing_numbers,
        'missing_relationships': original_relationships - summary_relationships,
        'contradictions': contradictions,
        'is_valid': len(missing_numbers) == 0 and len(contradictions) == 0
    }
 def preprocess_text(text):
     """Preprocess text to add appropriate formatting before summarization"""
     if not isinstance(text, str) or not text.strip():
     # Split text into sentences (basic implementation)
     sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n')]
+    # Remove empty sentences and extra whitespace
+    sentences = [re.sub(r'\s+', ' ', s).strip() for s in sentences if s.strip()]
     # Join with proper line breaks
     formatted_text = '\n'.join(sentences)
     return formatted_text
 def post_process_summary(summary):
+    """Enhanced post-processing for better structure and completeness"""
     if not summary:
         return summary
+    # Split into sections
+    sections = summary.split('\n')
+    processed_sections = []
+    for section in sections:
+        if not section.strip():
+            continue
+        # Remove redundant section headers
+        section = re.sub(r'^(Background and objectives|Methods|Results|Conclusions):\s*', '', section)
+        # Split into sentences
+        sentences = [s.strip() for s in section.split('.')]
+        sentences = [s for s in sentences if s]
+        processed_sentences = []
+        for i, sentence in enumerate(sentences):
+            # Fix common issues
+            sentence = re.sub(r'\s+', ' ', sentence)  # Fix spacing
+            sentence = re.sub(r'(\d+)\s*%', r'\1%', sentence)  # Fix percentage formatting
+            sentence = re.sub(r'\(\s*([Nn])\s*=\s*(\d+)\s*\)', r'(n=\2)', sentence)  # Fix sample size formatting
+            # Fix common phrase issues
+            sentence = sentence.replace(" and and ", " and ")
+            sentence = sentence.replace("appointment and appointment", "appointment")
+            sentence = sentence.replace("Cancers distress", "Cancer distress")
+            # Remove redundant phrases
+            sentence = re.sub(r'(?i)the aim of (the|this) study was to', '', sentence)
+            sentence = re.sub(r'(?i)this study aimed to', '', sentence)
+            # Capitalize first letter
+            sentence = sentence.capitalize()
+            if sentence.strip():
+                processed_sentences.append(sentence)
+        if processed_sentences:
+            section = '. '.join(processed_sentences)
+            if not section.endswith('.'):
+                section += '.'
+            processed_sections.append(section)
+    # Ensure key sections are present
+    required_sections = ['Background and objectives', 'Methods', 'Key findings', 'Conclusions']
+    final_sections = []
+    for i, section in enumerate(processed_sections):
+        if i < len(required_sections):
+            final_sections.append(f"{required_sections[i]}: {section}")
+        else:
+            final_sections.append(section)
+    return '\n\n'.join(final_sections)
 def improve_summary_generation(text, model, tokenizer):
+    """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
+    # Add a more specific prompt with strict guidelines
     formatted_text = (
+        "Generate a precise summary of this medical research paper following these strict guidelines:\n"
+        "1. Background and objectives: State ONLY the actual study purpose and population - no assumptions\n"
+        "2. Methods: Include ONLY methods explicitly mentioned in the text\n"
+        "3. Key findings: Report ALL numerical results and statistical relationships\n"
+        "4. Conclusions: State ONLY conclusions directly supported by the reported results\n\n"
+        "Requirements:\n"
+        "- Include ALL percentages and numbers from the original text\n"
+        "- Do not repeat section headers\n"
+        "- Do not make claims beyond what's explicitly stated\n"
+        "- Maintain the original meaning without contradiction\n"
+        "- Do not introduce new information\n\n"
+        "Original text: " + preprocess_text(text)
     )
+    # Tokenize input
     inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    def generate_attempt(temperature, num_beams, length_penalty):
         with torch.no_grad():
+            return model.generate(
+                **{
+                    "input_ids": inputs["input_ids"],
+                    "attention_mask": inputs["attention_mask"],
+                    "max_length": 300,  # Increased to ensure all facts are included
+                    "min_length": 100,  # Increased to encourage more complete summaries
+                    "num_beams": num_beams,
+                    "length_penalty": length_penalty,
+                    "no_repeat_ngram_size": 3,
+                    "temperature": temperature,
+                    "repetition_penalty": 2.0,  # Increased to reduce repetition
+                    "do_sample": True  # Enable sampling for more diverse outputs
+                }
             )
+    # Try different parameter combinations until we get a valid summary
+    parameter_combinations = [
+        {"temperature": 0.7, "num_beams": 5, "length_penalty": 1.5},
+        {"temperature": 0.5, "num_beams": 8, "length_penalty": 2.0},
+        {"temperature": 0.3, "num_beams": 10, "length_penalty": 2.5}
+    ]
+    best_summary = None
+    best_verification = None
+    for params in parameter_combinations:
+        summary_ids = generate_attempt(**params)
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
         processed_summary = post_process_summary(summary)
+        # Verify facts in the summary
+        verification = verify_facts(processed_summary, text)
+        if verification['is_valid']:
+            return processed_summary
+        # Keep track of best attempt
+        if best_verification is None or \
+           len(verification['missing_numbers']) < len(best_verification['missing_numbers']):
+            best_summary = processed_summary
+            best_verification = verification
+    # If no perfect summary was generated, use the best attempt
+    # Add missing information if necessary
+    if best_verification and best_verification['missing_numbers']:
+        # Attempt to add missing numerical information
+        additional_info = []
+        original_sentences = text.split('.')
+        for num in best_verification['missing_numbers']:
+            # Find sentences containing the missing number
+            for sentence in original_sentences:
+                if str(num) in sentence:
+                    additional_info.append(sentence.strip())
+                    break
+        if additional_info:
+            best_summary += "\n\nAdditional key findings: " + ". ".join(additional_info) + "."
+    return best_summary
 def validate_summary(summary, original_text):
+    """Validate summary content against original text"""
+    # Perform fact verification
+    verification = verify_facts(summary, original_text)
+    if not verification['is_valid']:
         return False
+    # Check for age inconsistencies
+    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
+    if len(age_mentions) > 1:  # Multiple age mentions
+        return False
+    # Check for repetitive sentences
     sentences = summary.split('.')
+    unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
+    if len(sentences) - len(unique_sentences) > 1:  # More than one duplicate
         return False
+    # Check summary isn't too long or too short compared to original
+    summary_words = len(summary.split())
+    original_words = len(original_text.split())
+    if summary_words < 20 or summary_words > original_words * 0.8:
+        return False
     return True
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
     # Preprocess each abstract