File size: 4,223 Bytes
6236d4d
a811bb7
 
 
6236d4d
a811bb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
import time
import re
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# 1. PAGE CONFIGURATION
# Must be the first Streamlit command executed in the script; "wide" layout
# gives the two-column chat/summary view below more horizontal room.
st.set_page_config(page_title="WhatsApp Chat Analyzer", page_icon="πŸ“±", layout="wide")

# 2. LOAD MODEL FROM HUGGING FACE
# We use @st.cache_resource so it only downloads once
@st.cache_resource
def load_pipeline():
    """Fetch and cache the summarization pipeline from the Hugging Face Hub.

    Decorated with @st.cache_resource so the model download happens only
    once per server process. On any load failure a Streamlit error is
    shown and None is returned so the caller can disable the UI.
    """
    model_id = "AishaniS/text_summarizer"  # Your specific HF repository

    try:
        # Pull tokenizer + model straight from the Hub, then wrap them
        # in a ready-to-call summarization pipeline.
        hub_tokenizer = AutoTokenizer.from_pretrained(model_id)
        hub_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        return pipeline("summarization", model=hub_model, tokenizer=hub_tokenizer)
    except Exception as e:
        st.error(f"Error loading model from Hugging Face: {e}")
        return None

summarizer = load_pipeline()

# 3. PREPROCESSING FUNCTION (Corrected for your Date/Time format)
def clean_whatsapp_log(text):
    """
    Parse a raw WhatsApp export into plain "Name: Message" lines.

    Strips the leading timestamp from every line and drops system noise
    (media placeholders, the end-to-end-encryption banner).

    Handles the documented target format:
        "24/12/25, 09:38 - Name: Message"
    and — backward-compatibly — common export variants that include
    seconds and/or an am/pm marker, e.g.
        "24/12/25, 9:38:12 pm - Name: Message"

    Args:
        text: Full contents of the exported chat file.

    Returns:
        Newline-joined cleaned lines; empty lines and filtered system
        messages are omitted. Empty string when nothing remains.
    """
    # Regex Breakdown:
    # \d{1,2}/\d{1,2}/\d{2,4}  -> Date (e.g., 24/12/25)
    # ,\s                      -> Comma and space
    # \d{1,2}:\d{2}(?::\d{2})? -> Time, optional seconds (09:38 or 09:38:12)
    # (?:\s?[AaPp][Mm]\.?)?    -> Optional am/pm marker (some exports use it)
    # \s-\s                    -> " - " separator
    timestamp_re = re.compile(
        r'\d{1,2}/\d{1,2}/\d{2,4},\s'
        r'\d{1,2}:\d{2}(?::\d{2})?'
        r'(?:\s?[AaPp][Mm]\.?)?'
        r'\s-\s'
    )

    clean_lines = []
    for line in text.split('\n'):
        # Filter system messages that carry no conversational content.
        if ("<Media omitted>" in line
                or "Messages and calls are end-to-end encrypted" in line):
            continue

        # Remove timestamp prefix (and any stray surrounding whitespace).
        cleaned_line = timestamp_re.sub('', line).strip()

        # Only keep lines with actual text remaining.
        if cleaned_line:
            clean_lines.append(cleaned_line)

    return "\n".join(clean_lines)

# 4. CHUNKING FUNCTION (To handle long chats)
def chunk_text(text, max_chars=2000):
    """
    Split *text* into chunks of at most max_chars characters, breaking on
    line boundaries where possible.

    Fix over the previous version: a single line longer than max_chars used
    to pass through as one oversized chunk, which could exceed the
    summarization model's input budget. Such lines are now hard-split at
    max_chars boundaries.

    Args:
        text: Newline-separated input text.
        max_chars: Upper bound on chunk size (each kept line contributes
            its text plus a trailing newline).

    Returns:
        List of chunk strings. Identical to the previous behavior whenever
        every individual line is shorter than max_chars.
    """
    chunks = []
    current_chunk = ""
    for line in text.split('\n'):
        # Hard-split a pathological line that alone exceeds the budget.
        while len(line) >= max_chars:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(line[:max_chars])
            line = line[max_chars:]
        if len(current_chunk) + len(line) < max_chars:
            current_chunk += line + "\n"
        else:
            chunks.append(current_chunk)
            current_chunk = line + "\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# 5. MAIN UI
# Streamlit re-runs this whole script top-to-bottom on every interaction,
# so everything below is recomputed each time; widget order defines layout.
st.title("πŸ“± Real-Time WhatsApp Summarizer")
st.markdown(f"**Model:** `AishaniS/text_summarizer` | **Status:** {'βœ… Loaded' if summarizer else '❌ Error'}")
st.markdown("Upload your exported `_chat.txt` file to analyze conversation.")

uploaded_file = st.file_uploader("Choose a file", type=['txt'])

# Proceed only when a file was uploaded AND the model loaded successfully
# (load_pipeline returns None on failure, which disables this whole branch).
if uploaded_file and summarizer:
    # NOTE(review): assumes the export is UTF-8 encoded; a differently
    # encoded file would raise UnicodeDecodeError here — confirm inputs.
    raw_text = uploaded_file.getvalue().decode("utf-8")
    
    # Preprocess: strip timestamps and WhatsApp system messages.
    clean_text = clean_whatsapp_log(raw_text)
    
    # Layout: Two columns — cleaned input left, generated summary right.
    col1, col2 = st.columns(2)
    
    with col1:
        st.subheader("πŸ“œ Processed Chat")
        st.text_area("Cleaned Input", clean_text, height=400)
    
    with col2:
        st.subheader("πŸ€– AI Summary")
        if st.button("Generate Summary"):
            if not clean_text:
                st.warning("Chat is empty after cleaning. Check the file format.")
            else:
                with st.spinner("Analyzing..."):
                    start_time = time.time() # Latency Timer Start
                    
                    # Generate: break the chat into model-sized pieces.
                    chunks = chunk_text(clean_text)
                    summary_parts = []
                    
                    # Summarize first 3 chunks to keep it fast
                    # (later chunks of a very long chat are deliberately dropped).
                    for i, chunk in enumerate(chunks[:3]):
                        try:
                            res = summarizer(chunk, max_length=128, min_length=30, do_sample=False)
                            summary_parts.append(res[0]['summary_text'])
                        except Exception as e:
                            # Best-effort: a failed chunk is reported but does
                            # not abort the remaining chunks.
                            st.warning(f"Could not summarize chunk {i+1}: {e}")
                    
                    final_summary = " ".join(summary_parts)
                    
                    end_time = time.time() # Latency Timer End
                    latency = end_time - start_time
                    
                    st.success(final_summary)
                    st.info(f"⏱️ Model Latency: {latency:.2f} seconds")