# Source: Research-Copilot / backend / agents / tools / SAS_HAS_processor.py
# Uploaded by Viraj0112 ("Upload 42 files"), commit 88b06aa (verified).
import os
import re
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field
from datetime import datetime
from prompts.prompts_template import (SAS_PROCESSOR_SYSTEM_PROMPT,
HAS_PROCESSOR_L1_SYSTEM_PROMPT,
HAS_PROCESSOR_L2_SYSTEM_PROMPT,
HAS_PROCESSOR_L3_SYSTEM_PROMPT,
FINAL_SYNTHESIS_PROMPT)
from utils.llm_factory import LLMFactory
import time
class SectionMetadata(BaseModel):
    """Metadata for a detected section.

    Pure data container: every field except the three ``contains_*`` flags
    is required. The 0-1 range mentioned for ``importance_score`` is
    documentation only; no validator in this model enforces it.
    """
    # What the section is and where it was found.
    section_type: str = Field(..., description="Type: abstract, introduction, methodology, results, etc.")
    section_title: str = Field(..., description="Original section title from paper")
    start_position: int = Field(..., description="Character position where section starts")
    # Size and weighting of the section.
    word_count: int = Field(..., description="Number of words in this section")
    importance_score: float = Field(..., description="Importance score 0-1")
    # Content flags; all default to False when not supplied.
    contains_figures: bool = Field(default=False, description="Whether section references figures")
    contains_tables: bool = Field(default=False, description="Whether section references tables")
    contains_equations: bool = Field(default=False, description="Whether section has equations")
class SectionSummary(BaseModel):
    """Section-Aware Summary (SAS) output.

    Used as the structured-output schema for the per-section LLM call
    (``with_structured_output(SectionSummary)``), so the field descriptions
    below double as instructions to the model. List fields default to empty;
    the identifiers, both summaries and both scores are required.
    """
    section_id: str = Field(..., description="Unique section identifier")
    section_type: str = Field(..., description="Section type")
    section_title: str = Field(..., description="Section title")
    # Core summary
    executive_summary: str = Field(..., description="1-2 sentence summary")
    detailed_summary: str = Field(..., description="Comprehensive summary")
    # Extracted elements
    key_points: List[str] = Field(default=[], description="Key points (3-7 items)")
    methodological_details: List[str] = Field(default=[], description="Methods, algorithms, approaches")
    empirical_findings: List[str] = Field(default=[], description="Results, experiments, metrics")
    technical_terms: List[str] = Field(default=[], description="Important technical terminology")
    citations_mentioned: List[str] = Field(default=[], description="Papers/authors cited")
    # Connections
    related_sections: List[str] = Field(default=[], description="References to other sections")
    # Quality metrics (the 0-1 range is stated in the description only, not validated)
    information_density: float = Field(..., description="Information density 0-1")
    novelty_score: float = Field(..., description="Novelty of content 0-1")
class HierarchicalLevel(BaseModel):
    """One level in the hierarchy.

    Instances are built by ``HierarchicalSummarizer`` with ``level`` set to
    1, 2 or 3; the model is also nested inside
    ``ComprehensivePaperAnalysis.hierarchical_summaries``.
    """
    level: int = Field(..., description="Hierarchy level (1=lowest, 3=highest)")
    summary: str = Field(..., description="Summary at this abstraction level")
    key_contributions: List[str] = Field(default=[], description="Key contributions at this level")
    # Optional so that a level produced without a scope (e.g. by the LLM during
    # final synthesis) does not fail validation.
    scope: Optional[str] = Field(default=None, description="What this level covers")
class ComprehensivePaperAnalysis(BaseModel):
    """Final complete analysis combining SAS + HAS.

    Used as the structured-output schema for the final synthesis LLM call.
    ``hierarchical_summaries``, ``section_summaries`` and ``total_sections``
    are overwritten by ``SASHASProcessor._synthesize_final_analysis`` after
    the call, so whatever the model returns for them is discarded.
    """
    # Paper identification
    paper_title: str = Field(..., description="Paper title")
    authors: List[str] = Field(default=[], description="Author names")
    publication_info: str = Field(default="", description="Publication venue/date")
    # Hierarchical summaries (HAS)
    # Defaults to [] so validation passes when the LLM omits it; the pipeline
    # fills it in afterwards.
    hierarchical_summaries: List[HierarchicalLevel] = Field(default=[], description="Multi-level summaries")
    # Section summaries (SAS)
    # Defaults to {} so validation passes when the LLM omits it; the pipeline
    # fills it in afterwards.
    section_summaries: Dict[str, str] = Field(default={}, description="Summary for each section")
    # Comprehensive extraction
    abstract_summary: str = Field(..., description="Abstract summary")
    contributions: List[str] = Field(..., description="Main contributions")
    methodology: Dict[str, Any] = Field(..., description="Methodology details")
    datasets: List[str] = Field(default=[], description="Datasets used")
    experiments: List[str] = Field(default=[], description="Experiments conducted")
    results: Dict[str, Any] = Field(..., description="Key results")
    limitations: List[str] = Field(default=[], description="Limitations")
    future_work: List[str] = Field(default=[], description="Future research directions")
    # Technical assessment
    technical_depth: str = Field(..., description="Technical depth assessment")
    novelty: str = Field(..., description="Novelty assessment")
    domain_tags: List[str] = Field(..., description="Research domain tags")
    # Resources
    code_resources: Dict[str, Any] = Field(default={}, description="Code/data resources")
    related_papers: List[str] = Field(default=[], description="Related work")
    citations: List[str] = Field(default=[], description="Important citations")
    # Metrics (the 0-1 range is stated in the description only, not validated)
    relevance_score: float = Field(..., description="Overall relevance 0-1")
    quality_score: float = Field(..., description="Paper quality 0-1")
    # Metadata
    total_sections: int = Field(..., description="Number of sections analyzed")
    # Local, timezone-naive ISO-8601 timestamp taken when the model instance is created.
    processing_timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
# ============================================================================
# SECTION-AWARE SUMMARIZATION (SAS) ENGINE
# ============================================================================
class SectionAwareSummarizer:
    """
    Section-Aware Summarization (SAS)

    Detects section headings in raw paper text with keyword regexes,
    classifies each section, merges sections of the same type and asks the
    LLM for a structured ``SectionSummary`` per (merged) section or chunk.
    """

    # Section type patterns and their importance weights.
    # Dict order matters: the first type with a matching pattern wins.
    SECTION_PATTERNS = {
        'abstract': {
            'patterns': [r'\babstract\b', r'\bsummary\b'],
            'importance': 1.0,
            'required_elements': ['problem', 'approach', 'results']
        },
        'introduction': {
            'patterns': [r'\bintroduction\b', r'\b1\.\s*introduction\b'],
            'importance': 0.95,
            'required_elements': ['motivation', 'problem', 'contributions']
        },
        'related_work': {
            'patterns': [r'\brelated work\b', r'\bprior work\b', r'\bliterature review\b'],
            'importance': 0.7,
            'required_elements': ['prior_approaches', 'gaps']
        },
        'background': {
            'patterns': [r'\bbackground\b', r'\bpreliminaries\b'],
            'importance': 0.75,
            'required_elements': ['concepts', 'definitions']
        },
        'methodology': {
            'patterns': [r'\bmethodology\b', r'\bmethod\b', r'\bapproach\b', r'\bmodel\b', r'\barchitecture\b'],
            'importance': 1.0,
            'required_elements': ['approach', 'algorithm', 'implementation']
        },
        'experiments': {
            'patterns': [r'\bexperiments\b', r'\bexperimental setup\b', r'\bevaluation\b'],
            'importance': 0.95,
            'required_elements': ['setup', 'datasets', 'metrics']
        },
        'results': {
            'patterns': [r'\bresults\b', r'\bfindings\b', r'\bperformance\b'],
            'importance': 1.0,
            'required_elements': ['metrics', 'comparisons', 'analysis']
        },
        'discussion': {
            'patterns': [r'\bdiscussion\b', r'\banalysis\b'],
            'importance': 0.85,
            'required_elements': ['interpretation', 'implications']
        },
        'conclusion': {
            'patterns': [r'\bconclusion\b', r'\bconcluding remarks\b'],
            'importance': 0.9,
            'required_elements': ['summary', 'impact', 'future_work']
        },
        'limitations': {
            'patterns': [r'\blimitations\b', r'\bweaknesses\b'],
            'importance': 0.8,
            'required_elements': ['constraints', 'weaknesses']
        },
        'future_work': {
            'patterns': [r'\bfuture work\b', r'\bfuture directions\b'],
            'importance': 0.75,
            'required_elements': ['directions', 'extensions']
        }
    }

    # Per-type extraction hints passed to the prompt as ``instruction``.
    TYPE_SPECIFIC_INSTRUCTIONS = {
        'abstract': "Extract the problem, approach, key results, and contributions.",
        'introduction': "Extract motivation, problem statement, main contributions, and paper organization.",
        'methodology': "Extract the approach, algorithms, model architecture, and implementation details.",
        'experiments': "Extract experimental setup, datasets, baselines, evaluation metrics, and protocols.",
        'results': "Extract performance metrics, comparisons with baselines, ablation studies, and key findings.",
        'discussion': "Extract interpretation of results, implications, and insights.",
        'conclusion': "Extract main takeaways, impact, and future directions.",
        'related_work': "Extract prior approaches, their limitations, and how this work differs."
    }
    DEFAULT_INSTRUCTION = "Extract key points, technical details, and important findings."

    # A line this long or longer is never treated as a heading.
    MAX_HEADER_LENGTH = 100
    # Type and weight of the catch-all section used when no heading is found.
    FALLBACK_SECTION_TYPE = 'full_text'
    FALLBACK_IMPORTANCE = 0.5
    # Lower bound for the per-retry content budget, so a large ``max_retries``
    # can never yield a zero or negative slice bound.
    MIN_CONTENT_CHARS = 5000

    def __init__(self, llm):
        """Store the chat model used for every summarization call.

        Args:
            llm: Object exposing ``with_structured_output`` (a LangChain chat
                model in this project).
        """
        self.llm = llm

    @staticmethod
    def _is_retryable_llm_error(error: Exception) -> bool:
        """Return True when the error text looks like a Groq tool-call failure."""
        raw = str(error)
        lowered = raw.lower()
        return (
            'tool_use_failed' in lowered
            or 'failed to call a function' in lowered
            or '400' in raw
        )

    def _append_section(
        self,
        sections: List[Dict[str, Any]],
        header: Optional[Dict[str, Any]],
        content_lines: List[str]
    ) -> None:
        """Append the section described by ``header`` unless it has no body lines."""
        if not header or not content_lines:
            return
        content_text = '\n'.join(content_lines)
        sections.append({
            'type': header['type'],
            'title': header['title'],
            'content': content_text,
            'start_position': header['start'],
            'word_count': len(content_text.split()),
            'importance': self.SECTION_PATTERNS[header['type']]['importance']
        })

    def detect_sections(self, paper_content: str) -> List[Dict[str, Any]]:
        """
        Detect and classify sections in the paper.

        A line counts as a heading when it is shorter than
        ``MAX_HEADER_LENGTH`` characters (after stripping) and matches any
        pattern in ``SECTION_PATTERNS``. Text before the first heading is
        discarded, as are headings with no body lines.

        NOTE(review): the keyword patterns are broad (e.g. ``\\bmodel\\b``), so
        short body lines containing such words are also taken as headings.

        Args:
            paper_content: Full paper text, newline separated.

        Returns:
            One dict per section with keys ``type``, ``title``, ``content``,
            ``start_position`` (0-based *line* index of the heading),
            ``word_count`` and ``importance``. If no section is detected but
            the text is non-blank, a single ``full_text`` section covering
            the whole input is returned so the paper is not silently dropped.
        """
        print("Detecting paper sections... 🔍")

        # Compile once instead of handing raw pattern strings to ``re`` per line.
        compiled = [
            (section_type, [re.compile(pattern) for pattern in config['patterns']])
            for section_type, config in self.SECTION_PATTERNS.items()
        ]

        sections: List[Dict[str, Any]] = []
        current_section = None
        current_content: List[str] = []

        for i, line in enumerate(paper_content.split('\n')):
            stripped = line.strip()
            detected_type = None
            if len(stripped) < self.MAX_HEADER_LENGTH:
                line_lower = stripped.lower()
                detected_type = next(
                    (
                        section_type
                        for section_type, patterns in compiled
                        if any(p.search(line_lower) for p in patterns)
                    ),
                    None
                )

            if detected_type:
                # Close the previous section before opening the new one.
                self._append_section(sections, current_section, current_content)
                current_section = {
                    'type': detected_type,
                    'title': stripped,
                    'start': i
                }
                current_content = []
            elif current_section:
                current_content.append(line)

        # Add last section
        self._append_section(sections, current_section, current_content)

        if not sections and paper_content.strip():
            # No usable heading: keep the text as one generic section rather
            # than returning nothing and summarizing an empty paper.
            sections.append({
                'type': self.FALLBACK_SECTION_TYPE,
                'title': 'Full Text',
                'content': paper_content,
                'start_position': 0,
                'word_count': len(paper_content.split()),
                'importance': self.FALLBACK_IMPORTANCE
            })

        print(f" ✓ SAS: Detected {len(sections)} sections")
        for sec in sections:
            print(f" - {sec['type']}: {sec['word_count']} words (importance: {sec['importance']})")
        return sections

    def summarize_section(self, section: Dict[str, Any], max_retries: int = 3) -> "SectionSummary":
        """
        Create section-aware summary with type-specific extraction.
        Includes retry logic to handle Groq tool_use_failed errors.

        Args:
            section: Section dict as produced by ``detect_sections``; only
                ``type``, ``title`` and ``content`` are read.
            max_retries: Maximum number of LLM attempts. Each retry sends
                5000 fewer characters of content (never fewer than
                ``MIN_CONTENT_CHARS``).

        Returns:
            The LLM's ``SectionSummary``, or a placeholder summary with
            neutral scores when every attempt failed with a retryable error.

        Raises:
            Exception: Any error that does not look like a Groq tool-call
                failure is re-raised immediately.
        """
        section_type = section['type']
        section_content = section['content']

        instruction = self.TYPE_SPECIFIC_INSTRUCTIONS.get(section_type, self.DEFAULT_INSTRUCTION)
        required_elements_list = self.SECTION_PATTERNS.get(section_type, {}).get('required_elements', [])
        required_elements_str = ', '.join(required_elements_list)

        prompt = ChatPromptTemplate.from_messages([
            ("system", SAS_PROCESSOR_SYSTEM_PROMPT),
            ("user", """Section Title: {title}
Section Type: {type}
Content:
{content}
Provide comprehensive section-aware summary.
""")])
        structured_llm = self.llm.with_structured_output(SectionSummary)
        chain = prompt | structured_llm

        # Retry logic to handle Groq tool_use_failed errors
        last_error = None
        for attempt in range(max_retries):
            # Progressively truncate content on each retry to reduce complexity
            max_chars = max(self.MIN_CONTENT_CHARS, 30000 - (attempt * 5000))
            try:
                # 'section_type', 'required_elements' and 'instruction' do not
                # appear in the user message; presumably they are placeholders
                # of SAS_PROCESSOR_SYSTEM_PROMPT — verify against the template.
                return chain.invoke({
                    'title': section['title'],
                    'type': section_type,
                    'content': section_content[:max_chars],
                    'section_type': section_type.upper(),
                    'required_elements': required_elements_str,
                    'instruction': instruction
                })
            except Exception as e:
                if not self._is_retryable_llm_error(e):
                    # For other errors, raise immediately
                    raise
                last_error = e
                print(f" ⚠️ Retry {attempt + 1}/{max_retries}: Groq tool_use error, reducing content size...")
                time.sleep(2)  # Brief pause before retry

        # If all retries failed, create a fallback minimal summary
        print(f" ⚠️ All retries failed for {section['title']}, creating fallback summary...")
        if last_error is not None:
            # Surface the cause instead of discarding it.
            print(f" ⚠️ Last error: {last_error}")
        return SectionSummary(
            section_id=section_type.upper(),
            section_type=section_type,
            section_title=section['title'],
            executive_summary=f"Summary of {section_type} section (processing error occurred).",
            detailed_summary=f"The {section_type} section could not be fully processed due to API limitations. Original content length: {len(section_content)} chars.",
            key_points=[f"Section type: {section_type}"],
            methodological_details=[],
            empirical_findings=[],
            technical_terms=[],
            citations_mentioned=[],
            related_sections=[],
            information_density=0.5,
            novelty_score=0.5
        )

    def _group_sections_globally(self, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Group ALL sections of the same type together, regardless of position.
        This ensures we only make ~1 API call per section type (e.g. one for all Methodology).

        The first section of each type is shallow-copied and retitled
        ``"<Type> (Merged)"``; later sections of that type contribute their
        content (joined with a blank line) and word count. Input dicts are
        not modified.

        Returns:
            Merged sections in order of first appearance of each type.
        """
        if not sections:
            return []

        merged_map: Dict[str, Dict[str, Any]] = {}  # dicts keep insertion order
        for section in sections:
            s_type = section['type']
            merged = merged_map.get(s_type)
            if merged is None:
                merged = section.copy()
                merged['title'] = f"{s_type.capitalize()} (Merged)"
                merged_map[s_type] = merged
            else:
                merged['content'] += "\n\n" + section['content']
                merged['word_count'] += section['word_count']
        return list(merged_map.values())

    def process_all_sections(
        self,
        sections: List[Dict[str, Any]],
        rate_limit_delay: float = 10,
        chunk_word_threshold: int = 1500
    ) -> List["SectionSummary"]:
        """
        Process all sections with section-aware summarization.

        Args:
            sections: Output of ``detect_sections``.
            rate_limit_delay: Seconds to sleep after each merged section
                (also after the last one, which spaces out the next pipeline
                phase's LLM call).
            chunk_word_threshold: Merged sections with more words than this
                are split into chunks and summarized chunk by chunk.

        Returns:
            One summary per merged section, or one per chunk for long ones.
        """
        print(" 📝 SAS: Generating section-aware summaries...")
        # Group ALL sections of the same type globally
        grouped_sections = self._group_sections_globally(sections)
        print(f" 📝 SAS: Globally grouped {len(sections)} sections into {len(grouped_sections)} unique types")

        summaries = []
        total = len(grouped_sections)
        for i, section in enumerate(grouped_sections):
            print(f" Processing group {i+1}/{total}: {section['type']} ({section['word_count']} words)")
            # Handle long sections by chunking (reduced threshold to avoid Groq errors)
            if section['word_count'] > chunk_word_threshold:
                print(f" (Long section with {section['word_count']} words, chunking...)")
                summaries.extend(self._chunk_and_summarize(section))
            else:
                summaries.append(self.summarize_section(section))
            # Rate limiting sleep
            time.sleep(rate_limit_delay)

        print(f" ✓ SAS: Generated {len(summaries)} section summaries")
        return summaries

    def _chunk_and_summarize(self, section: Dict[str, Any]) -> List["SectionSummary"]:
        """
        Chunk long sections and summarize each chunk.

        The content is split into pieces of up to 10000 characters with a
        500-character overlap; every piece inherits the parent's type, start
        position and importance and gets a ``"(Part N)"`` title suffix.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=10000,
            chunk_overlap=500,
            separators=["\n\n", "\n", ". ", " "]
        )
        chunk_summaries = []
        for i, chunk in enumerate(splitter.split_text(section['content'])):
            chunk_section = {
                'type': section['type'],
                'title': f"{section['title']} (Part {i+1})",
                'content': chunk,
                'start_position': section['start_position'],
                'word_count': len(chunk.split()),
                'importance': section['importance']
            }
            chunk_summaries.append(self.summarize_section(chunk_section))
        return chunk_summaries
# ============================================================================
# HIERARCHICAL SUMMARIZATION (HAS) ENGINE
# ============================================================================
class HierarchicalSummarizer:
    """
    Hierarchical Summarization (HAS)

    Creates multi-level abstractions:
    - Level 1: Detailed (section-level insights)
    - Level 2: Intermediate (cross-section synthesis)
    - Level 3: Executive (high-level overview)

    Each level is one free-text LLM call; the reply's ``content`` becomes the
    level's summary, and the key-contribution list is narrowed level by level.
    """

    def __init__(self, llm):
        """Store the chat model used for all three levels.

        Args:
            llm: Object exposing ``invoke(messages)`` whose result has a
                ``content`` attribute.
        """
        self.llm = llm

    def create_level1_summary(
        self,
        section_summaries: List[SectionSummary]
    ) -> HierarchicalLevel:
        """
        Level 1: Detailed summary from section summaries.

        Sends the concatenated detailed summaries (first 50000 characters)
        and the first 100 key points to the LLM.

        Returns:
            Level-1 ``HierarchicalLevel`` carrying up to 50 key points.
        """
        print(" 📊 HAS Level 1: Creating detailed summary...")

        # Flatten key points across sections, preserving section order.
        all_key_points = [
            point
            for summary in section_summaries
            for point in summary.key_points
        ]

        # Create detailed synthesis
        sections_text = "\n\n".join(
            f"**{s.section_type.upper()}**: {s.detailed_summary}"
            for s in section_summaries
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", HAS_PROCESSOR_L1_SYSTEM_PROMPT),
            ("user", """Section Summaries:
{sections}
All Key Points: {key_points}
Create detailed Level 1 summary that preserves technical depth.""")])
        response = self.llm.invoke(prompt.format_messages(
            sections=sections_text[:50000],
            key_points=str(all_key_points[:100])
        ))

        level1 = HierarchicalLevel(
            level=1,
            summary=response.content,
            key_contributions=all_key_points[:50],
            scope="Detailed section-level analysis with technical specifics"
        )
        print(" ✓ HAS Level 1 complete")
        return level1

    def create_level2_summary(
        self,
        level1: HierarchicalLevel,
        section_summaries: List[SectionSummary]
    ) -> HierarchicalLevel:
        """
        Level 2: Intermediate synthesis across sections.

        Combines the level-1 summary (first 20000 characters) with the
        executive summaries of methodology-like and results-like sections
        (first 10000 characters each).

        Returns:
            Level-2 ``HierarchicalLevel`` carrying the first 20 level-1
            contributions.
        """
        print(" 📊 HAS Level 2: Creating intermediate synthesis...")

        # Group sections by type for cross-section analysis
        methodology_types = ('methodology', 'approach', 'model')
        results_types = ('results', 'experiments')
        methodology_text = " | ".join(
            s.executive_summary for s in section_summaries if s.section_type in methodology_types
        )
        results_text = " | ".join(
            s.executive_summary for s in section_summaries if s.section_type in results_types
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", HAS_PROCESSOR_L2_SYSTEM_PROMPT),
            ("user", """Level 1 (Detailed):
{level1}
Methodology Insights: {methodology}
Results Insights: {results}
Create intermediate Level 2 summary focusing on main contributions and their validation.""")
        ])
        response = self.llm.invoke(prompt.format_messages(
            level1=level1.summary[:20000],
            methodology=methodology_text[:10000],
            results=results_text[:10000]
        ))

        level2 = HierarchicalLevel(
            level=2,
            summary=response.content,
            key_contributions=level1.key_contributions[:20],
            scope="Cross-section synthesis of contributions and findings"
        )
        print(" ✓ HAS Level 2 complete")
        return level2

    def create_level3_summary(
        self,
        level2: HierarchicalLevel,
        section_summaries: List[SectionSummary]
    ) -> HierarchicalLevel:
        """
        Level 3: Executive summary (highest abstraction).

        Uses the first summary typed ``abstract`` (or a placeholder string),
        the key points of the first summary typed ``introduction`` (or an
        empty list) and the level-2 summary (first 20000 characters).

        Returns:
            Level-3 ``HierarchicalLevel`` carrying the first 10 level-2
            contributions.
        """
        print(" 📊 HAS Level 3: Creating executive summary...")

        # Extract only the most critical information
        abstract_summary = next(
            (s.executive_summary for s in section_summaries if s.section_type == 'abstract'),
            "No abstract found"
        )
        contributions = next(
            (s.key_points for s in section_summaries if s.section_type == 'introduction'),
            []
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", HAS_PROCESSOR_L3_SYSTEM_PROMPT),
            ("user", """Abstract: {abstract}
Level 2 (Intermediate):
{level2}
Main Contributions: {contributions}
Create concise executive summary answering:
1. What problem does this solve?
2. What's the proposed solution?
3. What are the key results?
4. Why does it matter?
""")])
        response = self.llm.invoke(prompt.format_messages(
            abstract=abstract_summary,
            level2=level2.summary[:20000],
            contributions=str(contributions[:20])
        ))

        level3 = HierarchicalLevel(
            level=3,
            summary=response.content,
            key_contributions=level2.key_contributions[:10],
            scope="Executive overview for quick understanding"
        )
        print(" ✓ HAS Level 3 complete")
        return level3

    def create_hierarchy(
        self,
        section_summaries: List[SectionSummary]
    ) -> List[HierarchicalLevel]:
        """
        Create complete 3-level hierarchy.

        Returns:
            ``[level1, level2, level3]``; each level is derived from the one
            before it.
        """
        print("\n 🏗️ HAS: Building hierarchical summaries...")
        level1 = self.create_level1_summary(section_summaries)
        level2 = self.create_level2_summary(level1, section_summaries)
        level3 = self.create_level3_summary(level2, section_summaries)
        print(" ✓ HAS: Complete 3-level hierarchy created")
        return [level1, level2, level3]
# ============================================================================
# INTEGRATED SAS + HAS PROCESSOR
# ============================================================================
class SASHASProcessor:
    """
    Integrated Section-Aware + Hierarchical Summarization

    Complete pipeline:
    1. Section detection and classification
    2. Section-aware summarization (SAS)
    3. Hierarchical synthesis (HAS)
    4. Final comprehensive analysis
    """

    # Characters of the paper scanned for a title when the fallback analysis is built.
    TITLE_SCAN_CHARS = 2000

    def __init__(self, llm_config: Optional[Dict] = None):
        """Create the shared LLM and both summarization engines.

        Args:
            llm_config: Optional configuration forwarded unchanged to
                ``LLMFactory.get_llm``.
        """
        self.llm = LLMFactory.get_llm(
            agent="paper_analysis",
            temperature=0.1,
            max_retries=5,
            llm_config=llm_config
        )
        self.sas = SectionAwareSummarizer(self.llm)
        self.has = HierarchicalSummarizer(self.llm)

    def process_paper(self, paper_content: str) -> "ComprehensivePaperAnalysis":
        """
        Complete SAS + HAS processing pipeline.

        Args:
            paper_content: Full text of the paper.

        Returns:
            The final structured analysis (LLM-produced, or the locally
            assembled fallback when the synthesis call keeps failing).
        """
        print("\n" + "="*70)
        print("🚀 SAS + HAS PAPER ANALYSIS PIPELINE")
        print("="*70)
        print(f"Paper length: {len(paper_content):,} characters")
        print(f"Estimated words: {len(paper_content.split()):,}")

        # ====================================================================
        # PHASE 1: Section-Aware Summarization (SAS)
        # ====================================================================
        print("\n📋 PHASE 1: SECTION-AWARE SUMMARIZATION (SAS)")
        print("-" * 70)
        # Detect sections
        sections = self.sas.detect_sections(paper_content)
        # Summarize each section
        section_summaries = self.sas.process_all_sections(sections)

        # ====================================================================
        # PHASE 2: Hierarchical Summarization (HAS)
        # ====================================================================
        print("\n🏗️ PHASE 2: HIERARCHICAL SUMMARIZATION (HAS)")
        print("-" * 70)
        hierarchy = self.has.create_hierarchy(section_summaries)

        # ====================================================================
        # PHASE 3: Final Synthesis
        # ====================================================================
        print("\n🔬 PHASE 3: FINAL SYNTHESIS")
        print("-" * 70)
        final_analysis = self._synthesize_final_analysis(
            paper_content,
            sections,
            section_summaries,
            hierarchy
        )

        print("\n" + "="*70)
        print("✅ SAS + HAS ANALYSIS COMPLETE")
        print("="*70)
        print(f"Total sections analyzed: {len(sections)}")
        print(f"Hierarchical levels: {len(hierarchy)}")
        print(f"Total contributions identified: {len(final_analysis.contributions)}")
        return final_analysis

    @staticmethod
    def _shrinking_budget(base: int, step: int, attempt: int, floor: int) -> int:
        """Return ``base - attempt * step``, never going below ``floor``."""
        return max(floor, base - attempt * step)

    @staticmethod
    def _extract_title(text: str) -> str:
        """Return the first non-blank of the first five lines, else "Unknown Paper"."""
        for line in text.split('\n')[:5]:
            candidate = line.strip()
            if candidate:
                return candidate
        return "Unknown Paper"

    @staticmethod
    def _is_retryable_llm_error(error: Exception) -> bool:
        """Return True when the error text looks like a Groq tool-call failure."""
        raw = str(error)
        lowered = raw.lower()
        return (
            'tool_use_failed' in lowered
            or 'failed to call a function' in lowered
            or '400' in raw
        )

    @staticmethod
    def _level_summary(hierarchy: List["HierarchicalLevel"], index: int) -> str:
        """Return the summary at ``index``, or "" when the hierarchy is shorter."""
        return hierarchy[index].summary if len(hierarchy) > index else ""

    def _synthesize_final_analysis(
        self,
        paper_content: str,
        sections: List[Dict],
        section_summaries: List["SectionSummary"],
        hierarchy: List["HierarchicalLevel"],
        max_retries: int = 3
    ) -> "ComprehensivePaperAnalysis":
        """
        Synthesize everything into final structured output.
        Includes retry logic to handle Groq tool_use_failed errors.

        Args:
            paper_content: Full paper text; only its beginning is sent to
                the LLM, for title/author metadata.
            sections: Detected sections; only their count is used.
            section_summaries: SAS output to aggregate.
            hierarchy: HAS levels, expected in ``[level1, level2, level3]`` order.
            max_retries: Maximum number of LLM attempts; every retry sends
                a smaller payload (each budget has a positive floor).

        Returns:
            The LLM-produced analysis with ``hierarchical_summaries``,
            ``section_summaries`` and ``total_sections`` replaced by the
            values computed here, or a locally assembled fallback analysis
            when every attempt failed with a retryable error.

        Raises:
            Exception: Any error that does not look like a Groq tool-call
                failure is re-raised immediately.
        """
        print(" 🔄 Synthesizing final comprehensive analysis...")

        # Aggregate data from sections
        all_contributions = []
        all_methodologies = []
        all_results = []
        all_citations = []
        for summary in section_summaries:
            all_contributions.extend(summary.key_points)
            all_methodologies.extend(summary.methodological_details)
            all_results.extend(summary.empirical_findings)
            all_citations.extend(summary.citations_mentioned)

        # Build section summaries dict (a later section with the same title
        # replaces an earlier one).
        section_summaries_dict = {
            s.section_title: s.detailed_summary
            for s in section_summaries
        }

        # Create final synthesis prompt
        prompt = ChatPromptTemplate.from_messages([
            ("system", FINAL_SYNTHESIS_PROMPT),
            ("user", """First Page (for metadata):
{first_page}
HIERARCHICAL SUMMARIES:
Level 3 (Executive): {level3}
Level 2 (Intermediate): {level2}
Level 1 (Detailed): {level1}
SECTION DATA:
Contributions: {contributions}
Methodologies: {methodologies}
Results: {results}
Citations: {citations}
Create comprehensive structured analysis.
""")])
        structured_llm = self.llm.with_structured_output(ComprehensivePaperAnalysis)
        chain = prompt | structured_llm

        budget = self._shrinking_budget
        # Retry logic to handle Groq tool_use_failed errors
        last_error = None
        for attempt in range(max_retries):
            # Progressively reduce content on each retry
            first_page_len = budget(10000, 2000, attempt, 2000)
            level1_len = budget(20000, 4000, attempt, 4000)
            contrib_count = budget(50, 10, attempt, 10)
            method_count = budget(30, 5, attempt, 5)
            result_count = budget(30, 5, attempt, 5)
            citation_count = budget(50, 10, attempt, 10)
            try:
                final_analysis = chain.invoke({
                    # Sliced from the full text so the per-attempt budget
                    # actually takes effect.
                    "first_page": paper_content[:first_page_len],
                    "level3": self._level_summary(hierarchy, 2)[:10000],
                    "level2": self._level_summary(hierarchy, 1)[:15000],
                    "level1": self._level_summary(hierarchy, 0)[:level1_len],
                    "contributions": str(all_contributions[:contrib_count]),
                    "methodologies": str(all_methodologies[:method_count]),
                    "results": str(all_results[:result_count]),
                    "citations": str(all_citations[:citation_count])
                })
            except Exception as e:
                if not self._is_retryable_llm_error(e):
                    # For other errors, raise immediately
                    raise
                last_error = e
                print(f" ⚠️ Retry {attempt + 1}/{max_retries}: Groq tool_use error in final synthesis, reducing content...")
                time.sleep(2)
                continue

            # Enrich with hierarchical summaries and section summaries
            # This overwrites any empty/hallucinated lists from the LLM with the valid ones from previous phases
            final_analysis.hierarchical_summaries = hierarchy
            final_analysis.section_summaries = section_summaries_dict
            final_analysis.total_sections = len(sections)
            print(" ✓ Final synthesis complete")
            return final_analysis

        # If all retries failed, create a fallback analysis from collected data
        print(f" ⚠️ All retries failed for final synthesis, creating fallback analysis...")
        if last_error is not None:
            # Surface the cause instead of discarding it.
            print(f" ⚠️ Last error: {last_error}")

        # Extract title from first page
        paper_title = self._extract_title(paper_content[:self.TITLE_SCAN_CHARS])

        fallback_analysis = ComprehensivePaperAnalysis(
            paper_title=paper_title,
            authors=[],
            publication_info="",
            hierarchical_summaries=hierarchy,
            section_summaries=section_summaries_dict,
            abstract_summary=hierarchy[2].summary if len(hierarchy) > 2 else "Analysis completed with partial data.",
            contributions=all_contributions[:20],
            methodology={"approach": ", ".join(all_methodologies[:10])} if all_methodologies else {},
            datasets=[],
            experiments=[],
            results={"findings": ", ".join(all_results[:10])} if all_results else {},
            limitations=[],
            future_work=[],
            technical_depth="Moderate (fallback analysis)",
            novelty="See hierarchical summaries for details",
            domain_tags=["Research Paper"],
            code_resources={},
            related_papers=[],
            citations=all_citations[:20],
            relevance_score=0.7,
            quality_score=0.7,
            total_sections=len(sections)
        )
        print(" ✓ Fallback synthesis complete")
        return fallback_analysis