File size: 34,702 Bytes
88b06aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
import os
import re
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field
from datetime import datetime
from prompts.prompts_template import (SAS_PROCESSOR_SYSTEM_PROMPT, 
                        HAS_PROCESSOR_L1_SYSTEM_PROMPT, 
                        HAS_PROCESSOR_L2_SYSTEM_PROMPT, 
                        HAS_PROCESSOR_L3_SYSTEM_PROMPT, 
                        FINAL_SYNTHESIS_PROMPT)
from utils.llm_factory import LLMFactory
import time

class SectionMetadata(BaseModel):
    """Metadata for a detected section.

    Declarative pydantic schema with no methods or validators of its own.
    Fields declared with ``...`` are required; the three ``contains_*`` flags
    default to ``False``.
    """
    # Section category label, e.g. "abstract" or "methodology".
    section_type: str = Field(..., description="Type: abstract, introduction, methodology, results, etc.")
    # Heading text as it appears in the paper.
    section_title: str = Field(..., description="Original section title from paper")
    # NOTE(review): described as a character offset, but detect_sections()
    # stores a line index under 'start_position' -- confirm which is intended.
    start_position: int = Field(..., description="Character position where section starts")
    word_count: int = Field(..., description="Number of words in this section")
    # Documented as 0-1; no range constraint is declared on the field itself.
    importance_score: float = Field(..., description="Importance score 0-1")
    # Flags for referenced figures / tables / equations; each defaults to False.
    contains_figures: bool = Field(default=False, description="Whether section references figures")
    contains_tables: bool = Field(default=False, description="Whether section references tables")
    contains_equations: bool = Field(default=False, description="Whether section has equations")

class SectionSummary(BaseModel):
    """Section-Aware Summary (SAS) output.

    Used in two ways by ``SectionAwareSummarizer.summarize_section``: as the
    schema handed to ``llm.with_structured_output`` (so the LLM fills every
    field, including ``section_type``), and as the type of the hand-built
    fallback summary returned when all attempts fail.

    NOTE(review): the list fields use ``default=[]``; this relies on pydantic
    copying mutable defaults per instance -- confirm for the pinned version.
    """
    # Identification of the summarised section.
    section_id: str = Field(..., description="Unique section identifier")
    section_type: str = Field(..., description="Section type")
    section_title: str = Field(..., description="Section title")
    
    # Core summary: a short form and a long form of the same content.
    executive_summary: str = Field(..., description="1-2 sentence summary")
    detailed_summary: str = Field(..., description="Comprehensive summary")
    
    # Extracted elements; every list is optional and defaults to empty.
    key_points: List[str] = Field(default=[], description="Key points (3-7 items)")
    methodological_details: List[str] = Field(default=[], description="Methods, algorithms, approaches")
    empirical_findings: List[str] = Field(default=[], description="Results, experiments, metrics")
    technical_terms: List[str] = Field(default=[], description="Important technical terminology")
    citations_mentioned: List[str] = Field(default=[], description="Papers/authors cited")
    
    # Connections to other parts of the paper.
    related_sections: List[str] = Field(default=[], description="References to other sections")
    
    # Quality metrics; documented as 0-1, no range constraint is declared.
    information_density: float = Field(..., description="Information density 0-1")
    novelty_score: float = Field(..., description="Novelty of content 0-1")

class HierarchicalLevel(BaseModel):
    """One level in the hierarchy.

    ``HierarchicalSummarizer`` builds three of these (levels 1, 2 and 3), each
    carrying the LLM response text as ``summary`` plus a list of key
    contributions and a fixed ``scope`` label.
    """
    # 1 = most detailed, 3 = most abstract; the range is not validated here.
    level: int = Field(..., description="Hierarchy level (1=lowest, 3=highest)")
    summary: str = Field(..., description="Summary at this abstraction level")
    key_contributions: List[str] = Field(default=[], description="Key contributions at this level")
    # Optional so that a level without a scope still validates (the original
    # author's note says this avoided validation errors during final synthesis).
    scope: Optional[str] = Field(default=None, description="What this level covers")

class ComprehensivePaperAnalysis(BaseModel):
    """Final complete analysis combining SAS + HAS.

    Declarative pydantic schema for the pipeline's end result. Fields declared
    with ``...`` are required; all others have defaults.

    NOTE(review): several fields use ``default=[]`` / ``default={}``; this
    relies on pydantic copying mutable defaults per instance -- confirm for
    the pinned pydantic version.
    """
    
    # Paper identification
    paper_title: str = Field(..., description="Paper title")
    authors: List[str] = Field(default=[], description="Author names")
    publication_info: str = Field(default="", description="Publication venue/date")
    
    # Hierarchical summaries (HAS)
    # Defaults to empty so the model validates without it; per the original
    # author's note the pipeline fills this in afterwards (that code is not
    # visible in this part of the file).
    hierarchical_summaries: List[HierarchicalLevel] = Field(default=[], description="Multi-level summaries")
    
    # Section summaries (SAS)
    # Same rationale as above: empty by default, populated later by the pipeline.
    section_summaries: Dict[str, str] = Field(default={}, description="Summary for each section")
    
    # Comprehensive extraction
    abstract_summary: str = Field(..., description="Abstract summary")
    contributions: List[str] = Field(..., description="Main contributions")
    methodology: Dict[str, Any] = Field(..., description="Methodology details")
    datasets: List[str] = Field(default=[], description="Datasets used")
    experiments: List[str] = Field(default=[], description="Experiments conducted")
    results: Dict[str, Any] = Field(..., description="Key results")
    limitations: List[str] = Field(default=[], description="Limitations")
    future_work: List[str] = Field(default=[], description="Future research directions")
    
    # Technical assessment (free-text assessments plus required domain tags)
    technical_depth: str = Field(..., description="Technical depth assessment")
    novelty: str = Field(..., description="Novelty assessment")
    domain_tags: List[str] = Field(..., description="Research domain tags")
    
    # Resources
    code_resources: Dict[str, Any] = Field(default={}, description="Code/data resources")
    related_papers: List[str] = Field(default=[], description="Related work")
    citations: List[str] = Field(default=[], description="Important citations")
    
    # Metrics; documented as 0-1, no range constraint is declared.
    relevance_score: float = Field(..., description="Overall relevance 0-1")
    quality_score: float = Field(..., description="Paper quality 0-1")
    
    # Metadata
    total_sections: int = Field(..., description="Number of sections analyzed")
    # Evaluated at instantiation; datetime.now() without a tz argument yields a
    # naive local-time timestamp in ISO-8601 form.
    processing_timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())


# ============================================================================
# SECTION-AWARE SUMMARIZATION (SAS) ENGINE
# ============================================================================

class SectionAwareSummarizer:
    """
    Section-Aware Summarization (SAS)
    
    Intelligently detects, classifies, and summarizes paper sections
    with awareness of section type and importance.
    """
    
    # Registry of recognised section types, keyed by the canonical type name.
    # Each entry holds:
    #   'patterns'          - regexes searched in the lower-cased candidate
    #                         line (detect_sections lower-cases the line before
    #                         matching, so patterns are written in lower case);
    #   'importance'        - weight copied into every detected section dict;
    #   'required_elements' - names joined with ", " and passed to the
    #                         summarisation chain by summarize_section.
    # detect_sections walks this dict in insertion order and stops at the first
    # matching pattern, so earlier entries win when a line matches several types.
    SECTION_PATTERNS = {
        'abstract': {
            'patterns': [r'\babstract\b', r'\bsummary\b'],
            'importance': 1.0,
            'required_elements': ['problem', 'approach', 'results']
        },
        'introduction': {
            'patterns': [r'\bintroduction\b', r'\b1\.\s*introduction\b'],
            'importance': 0.95,
            'required_elements': ['motivation', 'problem', 'contributions']
        },
        'related_work': {
            'patterns': [r'\brelated work\b', r'\bprior work\b', r'\bliterature review\b'],
            'importance': 0.7,
            'required_elements': ['prior_approaches', 'gaps']
        },
        'background': {
            'patterns': [r'\bbackground\b', r'\bpreliminaries\b'],
            'importance': 0.75,
            'required_elements': ['concepts', 'definitions']
        },
        'methodology': {
            'patterns': [r'\bmethodology\b', r'\bmethod\b', r'\bapproach\b', r'\bmodel\b', r'\barchitecture\b'],
            'importance': 1.0,
            'required_elements': ['approach', 'algorithm', 'implementation']
        },
        'experiments': {
            'patterns': [r'\bexperiments\b', r'\bexperimental setup\b', r'\bevaluation\b'],
            'importance': 0.95,
            'required_elements': ['setup', 'datasets', 'metrics']
        },
        'results': {
            'patterns': [r'\bresults\b', r'\bfindings\b', r'\bperformance\b'],
            'importance': 1.0,
            'required_elements': ['metrics', 'comparisons', 'analysis']
        },
        'discussion': {
            'patterns': [r'\bdiscussion\b', r'\banalysis\b'],
            'importance': 0.85,
            'required_elements': ['interpretation', 'implications']
        },
        'conclusion': {
            'patterns': [r'\bconclusion\b', r'\bconcluding remarks\b'],
            'importance': 0.9,
            'required_elements': ['summary', 'impact', 'future_work']
        },
        'limitations': {
            'patterns': [r'\blimitations\b', r'\bweaknesses\b'],
            'importance': 0.8,
            'required_elements': ['constraints', 'weaknesses']
        },
        'future_work': {
            'patterns': [r'\bfuture work\b', r'\bfuture directions\b'],
            'importance': 0.75,
            'required_elements': ['directions', 'extensions']
        }
    }
    
    def __init__(self, llm):
        """Store the chat model used for every section summary.

        Args:
            llm: Chat model object. ``summarize_section`` calls
                ``with_structured_output`` on it, so it must provide that
                method.
        """
        self.llm = llm
    
    def detect_sections(self, paper_content: str) -> List[Dict[str, Any]]:
        """
        Detect and classify sections in the paper
        """
        print("Detecting paper sections... πŸ”")
        
        sections = []
        lines = paper_content.split('\n')
        current_section = None
        current_content = []
        
        for i, line in enumerate(lines):
            line_lower = line.lower().strip()
            
            # Check if this is a section header
            detected_type = None
            for section_type, config in self.SECTION_PATTERNS.items():
                for pattern in config['patterns']:
                    if re.search(pattern, line_lower) and len(line.strip()) < 100:
                        detected_type = section_type
                        break
                if detected_type:
                    break
            
            if detected_type:
                # Save previous section
                if current_section and current_content:
                    content_text = '\n'.join(current_content)
                    sections.append({
                        'type': current_section['type'],
                        'title': current_section['title'],
                        'content': content_text,
                        'start_position': current_section['start'],
                        'word_count': len(content_text.split()),
                        'importance': self.SECTION_PATTERNS[current_section['type']]['importance']
                    })
                
                # Start new section
                current_section = {
                    'type': detected_type,
                    'title': line.strip(),
                    'start': i
                }
                current_content = []
            elif current_section:
                current_content.append(line)
        
        # Add last section
        if current_section and current_content:
            content_text = '\n'.join(current_content)
            sections.append({
                'type': current_section['type'],
                'title': current_section['title'],
                'content': content_text,
                'start_position': current_section['start'],
                'word_count': len(content_text.split()),
                'importance': self.SECTION_PATTERNS[current_section['type']]['importance']
            })
        
        print(f"   βœ“ SAS: Detected {len(sections)} sections")
        for sec in sections:
            print(f"      - {sec['type']}: {sec['word_count']} words (importance: {sec['importance']})")
        
        return sections
    
    def summarize_section(self, section: Dict[str, Any], max_retries: int = 3) -> SectionSummary:
        """
        Create a section-aware summary with type-specific extraction.

        The section is sent to the LLM through a structured-output chain that
        returns a ``SectionSummary``. Errors that look like a Groq
        ``tool_use_failed`` / HTTP 400 failure are retried with progressively
        shorter content; any other exception propagates immediately. When
        every attempt fails, a placeholder ``SectionSummary`` is returned
        instead of raising.

        Args:
            section: Section dict with at least the ``type``, ``title`` and
                ``content`` keys (as produced by ``detect_sections``).
            max_retries: Maximum number of LLM attempts.

        Returns:
            The LLM-generated summary, or a minimal fallback summary with
            neutral (0.5) scores if all attempts failed.

        Raises:
            Exception: Any chain error that is not recognised as a retryable
                tool-use / 400 failure.
        """
        section_type = section['type']
        section_content = section['content']

        # Type-specific prompts
        type_specific_instructions = {
            'abstract': "Extract the problem, approach, key results, and contributions.",
            'introduction': "Extract motivation, problem statement, main contributions, and paper organization.",
            'methodology': "Extract the approach, algorithms, model architecture, and implementation details.",
            'experiments': "Extract experimental setup, datasets, baselines, evaluation metrics, and protocols.",
            'results': "Extract performance metrics, comparisons with baselines, ablation studies, and key findings.",
            'discussion': "Extract interpretation of results, implications, and insights.",
            'conclusion': "Extract main takeaways, impact, and future directions.",
            'related_work': "Extract prior approaches, their limitations, and how this work differs."
        }

        # Section types without a dedicated instruction get the generic one.
        instruction = type_specific_instructions.get(
            section_type,
            "Extract key points, technical details, and important findings."
        )

        required_elements_list = self.SECTION_PATTERNS.get(section_type, {}).get('required_elements', [])
        required_elements_str = ', '.join(required_elements_list)

        prompt = ChatPromptTemplate.from_messages([
            ("system", SAS_PROCESSOR_SYSTEM_PROMPT),
            ("user", """Section Title: {title}
                        Section Type: {type}
                        Content:
                        {content}
                        Provide comprehensive section-aware summary.
                    """)])

        structured_llm = self.llm.with_structured_output(SectionSummary)
        chain = prompt | structured_llm

        # Content budget per attempt: start large, shrink on every retry, but
        # never below a floor. Without the floor a large max_retries would
        # drive the slice bound to zero or negative, sending empty content or
        # chopping the tail instead of shortening the text.
        initial_max_chars = 30000
        shrink_per_retry = 5000
        min_max_chars = 5000

        # Retry logic to handle Groq tool_use_failed errors
        last_error = None
        for attempt in range(max_retries):
            max_chars = max(initial_max_chars - attempt * shrink_per_retry, min_max_chars)
            truncated_content = section_content[:max_chars]

            try:
                # The extra keys feed placeholders the system prompt may use.
                return chain.invoke({
                    'title': section['title'],
                    'type': section_type,
                    'content': truncated_content,
                    'section_type': section_type.upper(),
                    'required_elements': required_elements_str,
                    'instruction': instruction
                })
            except Exception as e:
                error_str = str(e).lower()
                retryable = (
                    'tool_use_failed' in error_str
                    or 'failed to call a function' in error_str
                    or '400' in error_str
                )
                if not retryable:
                    # For other errors, raise immediately
                    raise

                last_error = e
                print(f"         ⚠️ Retry {attempt + 1}/{max_retries}: Groq tool_use error, reducing content size...")
                # Pause only when another attempt will actually follow.
                if attempt + 1 < max_retries:
                    time.sleep(2)

        # If all retries failed, create a fallback minimal summary
        print(f"         ⚠️ All retries failed for {section['title']}, creating fallback summary...")
        if last_error is not None:
            # Surface the cause so the failure is diagnosable from the log.
            print(f"         Last error: {last_error}")
        return SectionSummary(
            section_id=section_type.upper(),
            section_type=section_type,
            section_title=section['title'],
            executive_summary=f"Summary of {section_type} section (processing error occurred).",
            detailed_summary=f"The {section_type} section could not be fully processed due to API limitations. Original content length: {len(section_content)} chars.",
            key_points=[f"Section type: {section_type}"],
            methodological_details=[],
            empirical_findings=[],
            technical_terms=[],
            citations_mentioned=[],
            related_sections=[],
            information_density=0.5,
            novelty_score=0.5
        )
    
    def _group_sections_globally(self, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Group ALL sections of the same type together, regardless of position.
        This ensures we only make ~1 API call per section type (e.g. one for all Methodology).
        """
        if not sections:
            return []
        
        # Dictionary to hold merged sections by type
        merged_map = {}
        
        # Order to preserve the first appearance of each type
        type_order = []
        
        for section in sections:
            s_type = section['type']
            
            if s_type not in merged_map:
                merged_map[s_type] = section.copy()
                merged_map[s_type]['title'] = f"{s_type.capitalize()} (Merged)"
                type_order.append(s_type)
            else:
                # Merge content
                merged_map[s_type]['content'] += "\n\n" + section['content']
                merged_map[s_type]['word_count'] += section['word_count']
        
        # Return list in order of first appearance
        return [merged_map[t] for t in type_order]
    
    def process_all_sections(self, sections: List[Dict[str, Any]]) -> List[SectionSummary]:
        """
        Process all sections with section-aware summarization
        """
        print("   πŸ“ SAS: Generating section-aware summaries...")
        
        # Group ALL sections of the same type globally
        grouped_sections = self._group_sections_globally(sections)
        print(f"   πŸ“ SAS: Globally grouped {len(sections)} sections into {len(grouped_sections)} unique types")
        
        summaries = []
        for i, section in enumerate(grouped_sections):
            print(f"      Processing group {i+1}/{len(grouped_sections)}: {section['type']} ({section['word_count']} words)")
            
            # Handle long sections by chunking (reduced threshold to avoid Groq errors)
            if section['word_count'] > 1500:
                print(f"         (Long section with {section['word_count']} words, chunking...)")
                chunk_summaries = self._chunk_and_summarize(section)
                summaries.extend(chunk_summaries)
            else:
                summary = self.summarize_section(section)
                summaries.append(summary)
            
            # Rate limiting sleep
            time.sleep(10)
        
        print(f"   βœ“ SAS: Generated {len(summaries)} section summaries")
        return summaries
    
    def _chunk_and_summarize(self, section: Dict[str, Any]) -> List[SectionSummary]:
        """
        Split a long section into overlapping chunks and summarise each one.

        Chunks are at most 10000 characters with a 500-character overlap, cut
        preferentially at paragraph, line, sentence and word boundaries. Each
        chunk is summarised as its own section titled "<title> (Part N)",
        keeping the parent's type, start position and importance.

        Returns:
            One summary per chunk, in chunk order.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ". ", " "],
            chunk_overlap=500,
            chunk_size=10000,
        )
        pieces = text_splitter.split_text(section['content'])

        return [
            self.summarize_section({
                'type': section['type'],
                'title': f"{section['title']} (Part {part_number})",
                'content': piece,
                'start_position': section['start_position'],
                'word_count': len(piece.split()),
                'importance': section['importance'],
            })
            for part_number, piece in enumerate(pieces, start=1)
        ]


# ============================================================================
# HIERARCHICAL SUMMARIZATION (HAS) ENGINE
# ============================================================================

class HierarchicalSummarizer:
    """
    Hierarchical Summarization (HAS)
    
    Creates multi-level abstractions:
    - Level 1: Detailed (section-level insights)
    - Level 2: Intermediate (cross-section synthesis)
    - Level 3: Executive (high-level overview)
    """
    
    def __init__(self, llm):
        """Store the chat model used to generate every hierarchy level.

        Args:
            llm: Chat model object. The ``create_level*_summary`` methods call
                ``invoke`` on it and read ``.content`` from the response.
        """
        self.llm = llm
    
    def create_level1_summary(
        self, 
        section_summaries: List[SectionSummary]
    ) -> HierarchicalLevel:
        """
        Level 1: Detailed summary built from the section summaries.

        The detailed summary of every section (labelled with its upper-cased
        type) and the combined key points are sent to the LLM; the response
        text becomes the level summary.

        Args:
            section_summaries: Section summaries in document order.

        Returns:
            A level-1 ``HierarchicalLevel`` whose ``key_contributions`` are the
            first 50 key points collected across all sections.
        """
        print("   πŸ“Š HAS Level 1: Creating detailed summary...")

        # Flatten the key points of every section, preserving section order.
        # (Empirical findings used to be collected here as well but were never
        # passed to the prompt or the result, so that dead work is gone.)
        all_key_points = [
            point
            for summary in section_summaries
            for point in summary.key_points
        ]

        # Create detailed synthesis
        sections_text = "\n\n".join(
            f"**{s.section_type.upper()}**: {s.detailed_summary}"
            for s in section_summaries
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", HAS_PROCESSOR_L1_SYSTEM_PROMPT),
            ("user", """Section Summaries:
                        {sections}
                        All Key Points: {key_points}
                        Create detailed Level 1 summary that preserves technical depth.""")])

        # Inputs are capped to keep the request size bounded.
        response = self.llm.invoke(prompt.format_messages(
            sections=sections_text[:50000],
            key_points=str(all_key_points[:100])
        ))

        level1 = HierarchicalLevel(
            level=1,
            summary=response.content,
            key_contributions=all_key_points[:50],
            scope="Detailed section-level analysis with technical specifics"
        )

        print("   βœ“ HAS Level 1 complete")
        return level1
    
    def create_level2_summary(
        self, 
        level1: HierarchicalLevel,
        section_summaries: List[SectionSummary]
    ) -> HierarchicalLevel:
        """
        Level 2: Intermediate synthesis across sections
        """
        print("   πŸ“Š HAS Level 2: Creating intermediate synthesis...")
        
        # Group sections by type for cross-section analysis
        methodology_sections = [s for s in section_summaries if s.section_type in ['methodology', 'approach', 'model']]
        results_sections = [s for s in section_summaries if s.section_type in ['results', 'experiments']]
        
        prompt = ChatPromptTemplate.from_messages([
            ("system", HAS_PROCESSOR_L2_SYSTEM_PROMPT),
            ("user", """Level 1 (Detailed):
{level1}

Methodology Insights: {methodology}
Results Insights: {results}

Create intermediate Level 2 summary focusing on main contributions and their validation.""")
        ])
        
        methodology_text = " | ".join([s.executive_summary for s in methodology_sections])
        results_text = " | ".join([s.executive_summary for s in results_sections])
        
        response = self.llm.invoke(prompt.format_messages(
            level1=level1.summary[:20000], # Increased from 3000
            methodology=methodology_text[:10000], # Increased from 1000
            results=results_text[:10000] # Increased from 1000
        ))
        
        level2 = HierarchicalLevel(
            level=2,
            summary=response.content,
            key_contributions=level1.key_contributions[:20], # Increased from 10
            scope="Cross-section synthesis of contributions and findings"
        )
        
        print("   βœ“ HAS Level 2 complete")
        return level2
    
    def create_level3_summary(
        self, 
        level2: HierarchicalLevel,
        section_summaries: List[SectionSummary]
    ) -> HierarchicalLevel:
        """
        Level 3: Executive summary (highest abstraction).

        Uses the abstract's executive summary, the introduction's key points
        and the level-2 summary to ask the LLM for a short overview.

        Args:
            level2: The intermediate level from ``create_level2_summary``.
            section_summaries: Section summaries in document order.

        Returns:
            A level-3 ``HierarchicalLevel`` carrying the first 10 key
            contributions of ``level2``.
        """
        print("   πŸ“Š HAS Level 3: Creating executive summary...")

        def first_of_type(wanted: str) -> Optional[SectionSummary]:
            # section_type is produced by the LLM and may differ in case or
            # padding ("ABSTRACT", " Abstract"), so compare normalised values.
            for candidate in section_summaries:
                if candidate.section_type.strip().lower() == wanted:
                    return candidate
            return None

        # Extract only the most critical information
        abstract_section = first_of_type('abstract')
        abstract_summary = (
            abstract_section.executive_summary
            if abstract_section is not None
            else "No abstract found"
        )

        introduction_section = first_of_type('introduction')
        contributions = introduction_section.key_points if introduction_section is not None else []

        prompt = ChatPromptTemplate.from_messages([
            ("system", HAS_PROCESSOR_L3_SYSTEM_PROMPT),
            ("user", """Abstract: {abstract}
                        Level 2 (Intermediate):
                        {level2}
                        Main Contributions: {contributions}

                        Create concise executive summary answering:
                        1. What problem does this solve?
                        2. What's the proposed solution?
                        3. What are the key results?
                        4. Why does it matter?
                    """)])

        # Inputs are capped to keep the request size bounded.
        response = self.llm.invoke(prompt.format_messages(
            abstract=abstract_summary,
            level2=level2.summary[:20000],
            contributions=str(contributions[:20])
        ))

        level3 = HierarchicalLevel(
            level=3,
            summary=response.content,
            key_contributions=level2.key_contributions[:10],
            scope="Executive overview for quick understanding"
        )

        print("   βœ“ HAS Level 3 complete")
        return level3
    
    def create_hierarchy(
        self, 
        section_summaries: List[SectionSummary]
    ) -> List[HierarchicalLevel]:
        """
        Build all three hierarchy levels from the section summaries.

        Each level is derived from the one below it: level 1 from the section
        summaries, level 2 from level 1, level 3 from level 2.

        Returns:
            ``[level1, level2, level3]``, ordered from most detailed to most
            abstract.
        """
        print("\n   πŸ—οΈ  HAS: Building hierarchical summaries...")

        detailed = self.create_level1_summary(section_summaries)
        intermediate = self.create_level2_summary(detailed, section_summaries)
        executive = self.create_level3_summary(intermediate, section_summaries)

        print("   βœ“ HAS: Complete 3-level hierarchy created")

        return [detailed, intermediate, executive]


# ============================================================================
# INTEGRATED SAS + HAS PROCESSOR
# ============================================================================

class SASHASProcessor:
    """
    Integrated Section-Aware + Hierarchical Summarization

    Complete pipeline:
    1. Section detection and classification
    2. Section-aware summarization (SAS)
    3. Hierarchical synthesis (HAS)
    4. Final comprehensive analysis
    """

    # Number of leading characters of the paper scanned for a fallback title.
    _TITLE_SCAN_CHARS = 2000

    # Prompt input limits for the final synthesis call, as
    # (limit on first attempt, reduction per retry, lowest allowed limit).
    # The floor keeps the limits positive for any ``max_retries`` value: a
    # zero or negative slice bound would otherwise drop everything or, worse,
    # send *more* content on later retries.
    _CONTENT_BUDGET = {
        "first_page": (10000, 2000, 2000),
        "level1": (20000, 4000, 4000),
        "contributions": (50, 10, 10),
        "methodologies": (30, 5, 10),
        "results": (30, 5, 10),
        "citations": (50, 10, 10),
    }

    def __init__(self, llm_config: Dict = None):
        """
        Create the shared LLM and the SAS / HAS workers that use it.

        Args:
            llm_config: Optional configuration forwarded to
                ``LLMFactory.get_llm``; ``None`` is passed through unchanged.
        """
        self.llm = LLMFactory.get_llm(
            agent="paper_analysis",
            temperature=0.1,
            max_retries=5,
            llm_config=llm_config
        )

        self.sas = SectionAwareSummarizer(self.llm)
        self.has = HierarchicalSummarizer(self.llm)

    def process_paper(self, paper_content: str) -> ComprehensivePaperAnalysis:
        """
        Complete SAS + HAS processing pipeline

        Runs section detection and summarization, builds the hierarchical
        summaries, then synthesizes the final structured analysis. Progress
        is reported on stdout.

        Args:
            paper_content: Full text of the paper.

        Returns:
            The analysis produced by ``_synthesize_final_analysis``.
        """
        print("\n" + "="*70)
        print("πŸš€ SAS + HAS PAPER ANALYSIS PIPELINE")
        print("="*70)
        print(f"Paper length: {len(paper_content):,} characters")
        print(f"Estimated words: {len(paper_content.split()):,}")

        # ====================================================================
        # PHASE 1: Section-Aware Summarization (SAS)
        # ====================================================================
        print("\nπŸ“‹ PHASE 1: SECTION-AWARE SUMMARIZATION (SAS)")
        print("-" * 70)

        # Detect sections
        sections = self.sas.detect_sections(paper_content)

        # Summarize each section
        section_summaries = self.sas.process_all_sections(sections)

        # ====================================================================
        # PHASE 2: Hierarchical Summarization (HAS)
        # ====================================================================
        print("\nπŸ—οΈ  PHASE 2: HIERARCHICAL SUMMARIZATION (HAS)")
        print("-" * 70)

        hierarchy = self.has.create_hierarchy(section_summaries)

        # ====================================================================
        # PHASE 3: Final Synthesis
        # ====================================================================
        print("\nπŸ”¬ PHASE 3: FINAL SYNTHESIS")
        print("-" * 70)

        final_analysis = self._synthesize_final_analysis(
            paper_content,
            sections,
            section_summaries,
            hierarchy
        )

        print("\n" + "="*70)
        print("βœ… SAS + HAS ANALYSIS COMPLETE")
        print("="*70)
        print(f"Total sections analyzed: {len(sections)}")
        print(f"Hierarchical levels: {len(hierarchy)}")
        print(f"Total contributions identified: {len(final_analysis.contributions)}")

        return final_analysis

    @classmethod
    def _content_budget(cls, attempt: int) -> Dict[str, int]:
        """
        Return the prompt input limits for a zero-based retry attempt.

        Each limit shrinks linearly with ``attempt`` and is clamped to its
        floor, so every value stays positive however many retries are made.
        """
        return {
            name: max(floor, start - attempt * step)
            for name, (start, step, floor) in cls._CONTENT_BUDGET.items()
        }

    @classmethod
    def _extract_title(cls, paper_content: str) -> str:
        """
        Return the first non-blank line among the paper's first five lines.

        Only the leading ``_TITLE_SCAN_CHARS`` characters are scanned. Falls
        back to ``"Unknown Paper"`` when all inspected lines are blank
        (including empty input).
        """
        head = paper_content[:cls._TITLE_SCAN_CHARS]
        for line in head.split('\n')[:5]:
            candidate = line.strip()
            if candidate:
                return candidate
        return "Unknown Paper"

    @staticmethod
    def _is_retryable_tool_error(error: Exception) -> bool:
        """
        Tell whether ``error`` looks like a Groq tool_use / HTTP 400 failure.

        This is a plain substring match on the exception text, so any message
        that happens to contain "400" is treated as retryable as well.
        """
        message = str(error)
        lowered = message.lower()
        return (
            'tool_use_failed' in lowered
            or 'failed to call a function' in lowered
            or '400' in message
        )

    def _synthesize_final_analysis(
        self,
        paper_content: str,
        sections: List[Dict],
        section_summaries: List[SectionSummary],
        hierarchy: List[HierarchicalLevel],
        max_retries: int = 3
    ) -> ComprehensivePaperAnalysis:
        """
        Synthesize everything into final structured output.
        Includes retry logic to handle Groq tool_use_failed errors.

        Each retry sends less content to the model (see ``_content_budget``).
        If every attempt fails with a retryable error, a fallback analysis is
        assembled from the data already collected instead of raising.

        Args:
            paper_content: Full text of the paper; its leading characters are
                sent to the model for metadata extraction.
            sections: Detected sections; only their count is used here.
            section_summaries: Per-section summaries to aggregate.
            hierarchy: Hierarchical levels, indexed as
                ``[level1, level2, level3]``.
            max_retries: Maximum number of structured-output attempts.

        Returns:
            The model's structured analysis enriched with the hierarchy and
            section summaries, or the fallback analysis.

        Raises:
            Exception: Any error from the model call that is not recognised
                as a retryable tool_use / 400 failure is re-raised as is.
        """
        print("   πŸ”„ Synthesizing final comprehensive analysis...")

        # Aggregate data from sections
        all_contributions = []
        all_methodologies = []
        all_results = []
        all_citations = []

        for summary in section_summaries:
            all_contributions.extend(summary.key_points)
            all_methodologies.extend(summary.methodological_details)
            all_results.extend(summary.empirical_findings)
            all_citations.extend(summary.citations_mentioned)

        # Build section summaries dict
        section_summaries_dict = {
            s.section_title: s.detailed_summary
            for s in section_summaries
        }

        # Create final synthesis prompt
        prompt = ChatPromptTemplate.from_messages([
            ("system", FINAL_SYNTHESIS_PROMPT),
            ("user", """First Page (for metadata):
                        {first_page}

                        HIERARCHICAL SUMMARIES:
                        Level 3 (Executive): {level3}
                        Level 2 (Intermediate): {level2}
                        Level 1 (Detailed): {level1}

                        SECTION DATA:
                        Contributions: {contributions}
                        Methodologies: {methodologies}
                        Results: {results}
                        Citations: {citations}
                        Create comprehensive structured analysis.
                    """)])

        structured_llm = self.llm.with_structured_output(ComprehensivePaperAnalysis)
        chain = prompt | structured_llm

        # Retry logic to handle Groq tool_use_failed errors
        for attempt in range(max_retries):
            # Progressively reduce content on each retry
            budget = self._content_budget(attempt)
            try:
                final_analysis = chain.invoke({
                    # Slice the paper itself: slicing a copy already cut to
                    # 2000 characters would make this limit a no-op.
                    "first_page": paper_content[:budget["first_page"]],
                    "level3": hierarchy[2].summary[:10000],
                    "level2": hierarchy[1].summary[:15000],
                    "level1": hierarchy[0].summary[:budget["level1"]],
                    "contributions": str(all_contributions[:budget["contributions"]]),
                    "methodologies": str(all_methodologies[:budget["methodologies"]]),
                    "results": str(all_results[:budget["results"]]),
                    "citations": str(all_citations[:budget["citations"]])
                })
            except Exception as e:
                # For other errors, raise immediately
                if not self._is_retryable_tool_error(e):
                    raise
                print(f"   ⚠️ Retry {attempt + 1}/{max_retries}: Groq tool_use error in final synthesis, reducing content...")
                # No point pausing when there is no further attempt to make.
                if attempt + 1 < max_retries:
                    time.sleep(2)
                continue

            # Enrich with hierarchical summaries and section summaries
            # This overwrites any empty/hallucinated lists from the LLM with the valid ones from previous phases
            final_analysis.hierarchical_summaries = hierarchy
            final_analysis.section_summaries = section_summaries_dict
            final_analysis.total_sections = len(sections)

            print("   βœ“ Final synthesis complete")
            return final_analysis

        # If all retries failed, create a fallback analysis from collected data
        print("   ⚠️ All retries failed for final synthesis, creating fallback analysis...")

        fallback_analysis = ComprehensivePaperAnalysis(
            paper_title=self._extract_title(paper_content),
            authors=[],
            publication_info="",
            hierarchical_summaries=hierarchy,
            section_summaries=section_summaries_dict,
            abstract_summary=hierarchy[2].summary if len(hierarchy) > 2 else "Analysis completed with partial data.",
            contributions=all_contributions[:20],
            methodology={"approach": ", ".join(all_methodologies[:10])} if all_methodologies else {},
            datasets=[],
            experiments=[],
            results={"findings": ", ".join(all_results[:10])} if all_results else {},
            limitations=[],
            future_work=[],
            technical_depth="Moderate (fallback analysis)",
            novelty="See hierarchical summaries for details",
            domain_tags=["Research Paper"],
            code_resources={},
            related_papers=[],
            citations=all_citations[:20],
            relevance_score=0.7,
            quality_score=0.7,
            total_sections=len(sections)
        )

        print("   βœ“ Fallback synthesis complete")
        return fallback_analysis