Spaces:

Marek4321
/

QualiLab

Sleeping

File size: 16,173 Bytes
# report_generator.py - Inteligentny generator raportów z self-prompting

import time
import streamlit as st
from typing import Dict, List, Optional, Tuple
from datetime import datetime

try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    st.error("❌ OpenAI library nie jest dostępna")

from config import REPORT_PROMPTS, MODEL_SETTINGS, INTERVIEW_TYPES

class ReportGenerator:
    """Multi-stage generator of long research reports using self-prompting.

    The pipeline builds an outline, generates each section separately,
    asks the model to expand sections that came out too short, and finally
    asks the model to assemble everything into a single report.

    Progress and errors are reported through Streamlit (`st.info`,
    `st.success`, `st.warning`, `st.error`). Usage counters are kept in
    `self.generation_stats` and accumulate over the lifetime of the instance.
    """

    # Prefix of the placeholder stored when generating a section fails.
    # Used to recognise such placeholders so they are never "expanded".
    _SECTION_ERROR_PREFIX = "[BŁĄD GENEROWANIA SEKCJI"

    # Sections with fewer words than this are sent to the expander prompt.
    _MIN_SECTION_WORDS = 500

    # Seconds to wait before retrying a rate-limited API call.
    _RATE_LIMIT_WAIT_SECONDS = 60

    def __init__(self, api_key: str):
        """Create the OpenAI client and zero the statistics counters.

        Args:
            api_key: OpenAI API key passed to the client.

        Raises:
            RuntimeError: If the `openai` package could not be imported.
        """
        if not OPENAI_AVAILABLE:
            raise RuntimeError("OpenAI library nie jest dostępna")

        self.client = OpenAI(api_key=api_key)
        self.api_key = api_key
        self.generation_stats = {
            'sections_generated': 0,
            'sections_expanded': 0,
            'total_tokens_used': 0,
            'total_cost_estimate': 0,
            'generation_time': 0
        }
        # Wall-clock start of the run in progress (None when idle).
        self._generation_started_at = None

    def generate_comprehensive_report(self, transcriptions: Dict[str, str], brief: str = "") -> str:
        """Run the full four-stage pipeline and return the final report text.

        Args:
            transcriptions: Mapping of file name to transcription text.
            brief: Optional research brief; a default text is substituted
                in the prompts when it is empty.

        Returns:
            The assembled report (or the fallback report when the final
            assembly call fails).

        Raises:
            ValueError: If `transcriptions` is empty.
            RuntimeError: If no outline could be generated.
            Exception: Any other error is shown via `st.error` and re-raised.
        """
        # Stored on the instance so the metadata footer written during
        # assembly can report the real elapsed time instead of a stale value.
        self._generation_started_at = time.time()

        try:
            if not transcriptions:
                # Without input the model would only invent a report.
                raise ValueError("Brak transkrypcji do analizy")

            st.info("📋 Rozpoczynam generowanie raportu...")

            # Prepare the input data
            combined_transcriptions = self._combine_transcriptions(transcriptions)
            interview_type = self._detect_interview_type(combined_transcriptions)

            st.info(f"🔍 Wykryto typ: {INTERVIEW_TYPES.get(interview_type, 'nieznany')}")

            # STAGE 1: outline
            st.info("📝 Etap 1/4: Tworzenie struktury raportu...")
            outline = self._generate_outline(combined_transcriptions, brief, interview_type)

            if not outline:
                raise RuntimeError("Nie udało się wygenerować struktury raportu")

            # STAGE 2: section-by-section generation
            st.info("✍️ Etap 2/4: Generowanie treści sekcji...")
            sections = self._generate_sections_iteratively(
                outline, combined_transcriptions, brief, interview_type
            )

            # STAGE 3: expand sections that are too short (self-prompting)
            st.info("🔍 Etap 3/4: Pogłębianie analizy...")
            expanded_sections = self._expand_short_sections(
                sections, combined_transcriptions, brief
            )

            # STAGE 4: final assembly with introduction and summary
            st.info("📄 Etap 4/4: Finalne scalenie...")
            final_report = self._assemble_final_report(
                expanded_sections, brief, interview_type, len(transcriptions)
            )

            # Statistics
            self._refresh_generation_time()

            st.success(f"🎉 Raport wygenerowany! ({self.generation_stats['generation_time']:.1f}s)")
            self._log_generation_stats()

            return final_report

        except Exception as e:
            st.error(f"❌ Błąd generowania raportu: {str(e)}")
            raise  # bare raise keeps the original traceback
        finally:
            self._generation_started_at = None

    def _refresh_generation_time(self) -> None:
        """Update `generation_time` from the start of the run in progress.

        Does nothing when no run is in progress, so helper methods can be
        called on their own without clobbering the stored value.
        """
        started_at = getattr(self, '_generation_started_at', None)
        if started_at is not None:
            self.generation_stats['generation_time'] = time.time() - started_at

    @classmethod
    def _is_error_placeholder(cls, section_content) -> bool:
        """Return True if the text is the placeholder for a failed section."""
        return (
            isinstance(section_content, str)
            and section_content.startswith(cls._SECTION_ERROR_PREFIX)
        )

    def _combine_transcriptions(self, transcriptions: Dict[str, str]) -> str:
        """Join all transcriptions into one text, each under a numbered header.

        A missing (`None`) transcription is treated as empty text so a single
        bad entry does not abort the whole run.
        """
        return "\n".join(
            f"\n\n=== WYWIAD {i}: {filename} ===\n\n{transcription or ''}"
            for i, (filename, transcription) in enumerate(transcriptions.items(), 1)
        )

    def _detect_interview_type(self, transcriptions: str) -> str:
        """Guess the interview type from keyword hits in the text.

        Each indicator phrase counts once if it occurs anywhere in the
        lower-cased text (substring match).

        Returns:
            'fgi' or 'idi' for the type with more hits, 'auto' on a tie.
        """
        text_lower = transcriptions.lower()

        # FGI (focus group) indicators
        fgi_indicators = [
            'moderator', 'grupa', 'wszyscy', 'uczestnicy', 'dyskusja',
            'czy zgadzacie się', 'co myślicie', 'focus group'
        ]

        # IDI (individual interview) indicators
        idi_indicators = [
            'wywiad indywidualny', 'jeden na jeden', 'interviewer',
            'opowiedz mi', 'jak się czujesz', 'twoje doświadczenie'
        ]

        fgi_score = sum(1 for indicator in fgi_indicators if indicator in text_lower)
        idi_score = sum(1 for indicator in idi_indicators if indicator in text_lower)

        if fgi_score > idi_score:
            return 'fgi'
        if idi_score > fgi_score:
            return 'idi'
        return 'auto'

    def _generate_outline(self, transcriptions: str, brief: str, interview_type: str) -> Dict:
        """Ask the model for a report outline and parse it.

        Only the first 8000 characters of the transcriptions are sent.

        Returns:
            Mapping of section title to list of bullet points; an empty dict
            when the call or the parsing fails (the error is shown via
            `st.error`, not raised).
        """
        try:
            prompt = REPORT_PROMPTS['outline_generator'].format(
                transcriptions=transcriptions[:8000],  # limit the prompt size
                brief=brief or "Brak szczegółowego briefu",
                interview_type=INTERVIEW_TYPES.get(interview_type, 'wywiad')
            )

            response = self._call_gpt(prompt)
            outline = self._parse_outline(response)

            st.success(f"✅ Outline: {len(outline)} sekcji zaplanowanych")
            return outline

        except Exception as e:
            st.error(f"❌ Błąd generowania outline: {e}")
            return {}

    def _generate_sections_iteratively(self, outline: Dict, transcriptions: str, brief: str, interview_type: str) -> Dict:
        """Generate the report sections one at a time.

        A section that fails is not fatal: a warning is shown and an error
        placeholder is stored in its place.

        Returns:
            Mapping of section title to generated text (or error placeholder).
        """
        sections = {}

        for section_title, section_points in outline.items():
            if not section_title or section_title.startswith('#'):
                continue

            st.info(f"📝 Generuję: {section_title}")

            try:
                prompt = REPORT_PROMPTS['section_generator'].format(
                    transcriptions=transcriptions,
                    brief=brief or "Brak szczegółowego briefu",
                    interview_type=INTERVIEW_TYPES.get(interview_type, 'wywiad'),
                    outline=str(outline),
                    section_title=section_title,
                    section_points=section_points
                )

                section_content = self._call_gpt(prompt)
                sections[section_title] = section_content

                self.generation_stats['sections_generated'] += 1
                st.success(f"✅ {section_title} ({len(section_content.split())} słów)")

                # Short pause to stay under the API rate limits
                time.sleep(2)

            except Exception as e:
                st.warning(f"⚠️ Błąd sekcji '{section_title}': {e}")
                sections[section_title] = f"{self._SECTION_ERROR_PREFIX}: {e}]"

        return sections

    def _expand_short_sections(self, sections: Dict, transcriptions: str, brief: str) -> Dict:
        """Self-prompting step: ask the model to expand sections that are too short.

        Error placeholders left by a failed section are passed through
        untouched; expanding them would make the model invent content from an
        error message. If the expansion call fails or returns no text, the
        original section is kept.

        Returns:
            Mapping of section title to final section text, in input order.
        """
        expanded_sections = {}

        for section_title, section_content in sections.items():
            if self._is_error_placeholder(section_content):
                expanded_sections[section_title] = section_content
                continue

            word_count = len(section_content.split())

            if word_count >= self._MIN_SECTION_WORDS:
                # Already long enough
                expanded_sections[section_title] = section_content
                st.success(f"✅ {section_title} OK ({word_count} słów)")
                continue

            st.info(f"🔍 Rozszerzam: {section_title} ({word_count} słów)")

            try:
                prompt = REPORT_PROMPTS['section_expander'].format(
                    current_section=section_content,
                    transcriptions=transcriptions,
                    brief=brief or "Brak szczegółowego briefu"
                )

                expanded_content = self._call_gpt(prompt)
                if not expanded_content.strip():
                    # Never replace existing text with nothing.
                    raise ValueError("pusta odpowiedź modelu")

                expanded_sections[section_title] = expanded_content

                new_word_count = len(expanded_content.split())
                self.generation_stats['sections_expanded'] += 1

                st.success(f"✅ Rozszerzone: {section_title} ({word_count} → {new_word_count} słów)")

                time.sleep(2)  # rate limit protection

            except Exception as e:
                st.warning(f"⚠️ Nie udało się rozszerzyć '{section_title}': {e}")
                expanded_sections[section_title] = section_content

        return expanded_sections

    def _assemble_final_report(self, sections: Dict, brief: str, interview_type: str, interviews_count: int) -> str:
        """Ask the model to merge the sections and append a metadata footer.

        Returns:
            The model's final report followed by the metadata block. If the
            assembly call fails or returns no text, the plain fallback report
            built by `_create_fallback_report` is returned instead.
        """
        try:
            sections_text = "\n\n".join([
                f"## {title}\n\n{content}"
                for title, content in sections.items()
            ])

            prompt = REPORT_PROMPTS['final_assembly'].format(
                sections=sections_text,
                brief=brief or "Brak szczegółowego briefu",
                interview_type=INTERVIEW_TYPES.get(interview_type, 'wywiad'),
                interviews_count=interviews_count,
                date=datetime.now().strftime("%Y-%m-%d")
            )

            final_report = self._call_gpt(prompt, max_tokens=4000)
            if not final_report.strip():
                # An empty answer would yield a report made of metadata only.
                raise ValueError("pusta odpowiedź modelu")

            # The footer below prints the elapsed time, so bring it up to
            # date now; otherwise it would still hold its initial value.
            self._refresh_generation_time()

            # Append metadata at the end
            metadata = f"""

---

## METADATA RAPORTU
- **Wygenerowano**: {datetime.now().strftime("%Y-%m-%d %H:%M")}
- **Typ badania**: {INTERVIEW_TYPES.get(interview_type, 'nieznany')}
- **Liczba wywiadów**: {interviews_count}
- **Sekcji wygenerowanych**: {self.generation_stats['sections_generated']}
- **Sekcji rozszerzonych**: {self.generation_stats['sections_expanded']}
- **Czas generowania**: {self.generation_stats['generation_time']:.1f}s
- **Generator**: FGI/IDI Research Analyzer v1.0
"""

            return final_report + metadata

        except Exception as e:
            st.error(f"❌ Błąd finalnego scalenia: {e}")
            # Fallback - return at least the raw sections
            return self._create_fallback_report(sections, brief, interview_type)

    def _call_gpt(self, prompt: str, max_tokens: int = 3000, max_retries: int = 3) -> str:
        """Send one chat-completion request and return the reply text.

        Args:
            prompt: User message content.
            max_tokens: Completion token limit passed to the API.
            max_retries: How many times a rate-limited call is retried
                (after a fixed wait) before the error is re-raised.

        Returns:
            The content of the first choice, or an empty string when the
            API returned no content.

        Raises:
            Exception: Whatever the client raised, once it is not a
                rate-limit error or the retries are exhausted.
        """
        attempt = 0
        while True:
            try:
                response = self.client.chat.completions.create(
                    model=MODEL_SETTINGS['gpt']['model'],
                    messages=[
                        {"role": "system", "content": "Jesteś ekspertem analizy badań jakościowych. Tworzysz profesjonalne, szczegółowe raporty badawcze."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=MODEL_SETTINGS['gpt']['temperature'],
                    max_tokens=max_tokens
                )
                break
            except Exception as e:
                # Rate limits are detected by message text, as before; the
                # retry is now bounded instead of recursing without limit.
                if "rate limit" not in str(e).lower() or attempt >= max_retries:
                    raise
                attempt += 1
                st.warning("⏳ Rate limit - czekam 60s...")
                time.sleep(self._RATE_LIMIT_WAIT_SECONDS)

        # Statistics (`usage` may be absent or None)
        usage = getattr(response, 'usage', None)
        if usage is not None:
            self.generation_stats['total_tokens_used'] += usage.total_tokens
            # Cost estimate for GPT-4o-mini: ~$0.00015 per 1K tokens
            self.generation_stats['total_cost_estimate'] += (usage.total_tokens / 1000) * 0.00015

        # `content` can be None; callers always expect a string.
        return response.choices[0].message.content or ""

    def _parse_outline(self, outline_text: str) -> Dict:
        """Parse the model's outline into `{section title: [points]}`.

        Lines starting with `## ` open a section; lines starting with a
        `- `, `* ` or `• ` bullet are added to the current section. Bullets
        before the first heading and all other lines are ignored. A repeated
        heading keeps the points already collected under it.
        """
        outline = {}
        current_section = None

        for line in (outline_text or "").split('\n'):
            line = line.strip()

            if line.startswith('## '):
                # New section
                current_section = line[3:].strip()
                outline.setdefault(current_section, [])
            elif current_section and line.startswith(('- ', '* ', '• ')):
                # Bullet point of the current section (all markers are 2 chars)
                outline[current_section].append(line[2:].strip())

        return outline

    def _create_fallback_report(self, sections: Dict, brief: str, interview_type: str) -> str:
        """Build a plain report from the sections when final assembly fails."""
        report_parts = [
            f"# RAPORT Z BADANIA {INTERVIEW_TYPES.get(interview_type, 'INTERVIEW').upper()}",
            f"\n**Data**: {datetime.now().strftime('%Y-%m-%d')}",
            f"**Brief**: {brief or 'Brak szczegółowego briefu'}",
            "\n---\n"
        ]

        for title, content in sections.items():
            report_parts.append(f"## {title}\n\n{content}\n\n")

        return "\n".join(report_parts)

    def _log_generation_stats(self):
        """Show the accumulated generation statistics in the Streamlit UI."""
        stats = self.generation_stats

        st.info(f"""
📊 **Statystyki generowania:**
- Sekcji: {stats['sections_generated']} wygenerowanych, {stats['sections_expanded']} rozszerzonych
- Tokeny: ~{stats['total_tokens_used']:,}
- Koszt: ~${stats['total_cost_estimate']:.4f}
- Czas: {stats['generation_time']:.1f}s
        """)

    def evaluate_section_quality(self, section_content: str) -> Dict:
        """Ask the model to rate a section (debugging aid).

        Lines of the reply that contain both ':' and '/10' are parsed as
        `criterion: score/10`; lines whose score is not an integer are
        skipped.

        Returns:
            Dict with 'scores', 'needs_improvement' and 'evaluation_text',
            or `{'error': message}` when the call fails.
        """
        try:
            prompt = REPORT_PROMPTS['quality_checker'].format(section=section_content)
            evaluation = self._call_gpt(prompt, max_tokens=500)

            # Parse the rating (simplified)
            scores = {}

            for line in evaluation.split('\n'):
                if ':' in line and '/10' in line:
                    parts = line.split(':')
                    criterion = parts[0].strip()
                    score = parts[1].strip().split('/')[0]
                    try:
                        scores[criterion] = int(score)
                    except ValueError:
                        # Not an integer score (e.g. "7.5" or markup): skip it.
                        continue

            # NOTE(review): substring test, so words such as "także" also
            # match - confirm against the quality_checker prompt format.
            needs_improvement = 'TAK' in evaluation.upper()

            return {
                'scores': scores,
                'needs_improvement': needs_improvement,
                'evaluation_text': evaluation
            }

        except Exception as e:
            return {'error': str(e)}

    def get_generation_stats(self) -> Dict:
        """Return a shallow copy of the generation statistics."""
        return self.generation_stats.copy()

# Funkcje pomocnicze
def estimate_report_length(transcriptions: Dict[str, str]) -> Dict:
    """Estimate the size of the final report from the transcriptions.

    Words are counted by whitespace splitting. The report is assumed to be
    20% of the transcription length, a page to hold about 250 words, and
    generation to take about 120 seconds per interview.

    Returns:
        Dict with 'transcription_words', 'estimated_report_words',
        'estimated_pages' (float) and 'estimated_generation_time' (seconds).
    """
    word_total = 0
    for text in transcriptions.values():
        word_total += len(text.split())

    # Reports usually come to 15-25% of the transcription length
    report_words = int(word_total * 0.2)

    return {
        'transcription_words': word_total,
        'estimated_report_words': report_words,
        'estimated_pages': report_words / 250,  # ~250 words per page
        'estimated_generation_time': len(transcriptions) * 120,  # ~2 min per interview
    }

# Test modułu
if __name__ == "__main__":
    print("🧪 Test ReportGenerator")

    # Smoke test only: constructs the generator with a dummy key and
    # exercises the length estimator.
    try:
        smoke_generator = ReportGenerator("test-key")
        print("✅ ReportGenerator zainicjalizowany")

        # Synthetic transcriptions for the estimator
        sample_inputs = {
            "test1.mp3": "To jest przykładowa transkrypcja wywiadu. " * 100,
            "test2.mp3": "To jest druga transkrypcja z badania. " * 150
        }

        length_info = estimate_report_length(sample_inputs)
        report_words = length_info['estimated_report_words']
        report_pages = length_info['estimated_pages']
        print(f"📊 Estymacja: {report_words} słów, {report_pages:.1f} stron")

    except Exception as exc:
        print(f"❌ Błąd testu: {exc}")

    print("✅ Test zakończony")