import math
import numpy as np
import streamlit as st
from tomlkit import datetime
from engine import AnalyzerEngine
import pandas as pd
import datetime
import plotly.graph_objects as go
import os
COUNTER_FILE = "/data/visits.txt"


def get_and_increment_visits(counter_file=COUNTER_FILE):
    """Increment the persistent visit counter and return the new total.

    Args:
        counter_file: Path of the text file holding the counter. Defaults to
            ``COUNTER_FILE``. The counter is considered disabled when the
            folder containing this file does not exist.

    Returns:
        The new visit count as an ``int``, or the string ``"Non abilitato"``
        when the persistent folder is missing or the file cannot be read or
        written.
    """
    storage_dir = os.path.dirname(counter_file) or "."
    # Make sure the persistent folder exists; otherwise the counter is disabled.
    if not os.path.exists(storage_dir):
        return "Non abilitato"

    try:
        # Read the current value (a missing file means "no visits yet").
        visits = 0
        if os.path.exists(counter_file):
            with open(counter_file, "r", encoding="utf-8") as f:
                content = f.read().strip()
            # str.isdigit() alone also accepts characters such as "²" that
            # int() rejects, so restrict the check to ASCII digits.
            if content.isascii() and content.isdigit():
                visits = int(content)

        # Increment and persist.
        visits += 1
        with open(counter_file, "w", encoding="utf-8") as f:
            f.write(str(visits))
    except OSError:
        # A read-only or otherwise unusable volume must not take the app down.
        return "Non abilitato"
    return visits
# Count the visit only once per browser session: the flag stored in the
# session state prevents further increments on later script reruns.
if "visit_counted" not in st.session_state:
    st.session_state["total_visits"] = get_and_increment_visits()
    st.session_state["visit_counted"] = True

# 1. Page configuration
st.set_page_config(layout="wide", page_title="Co\u00B3 Suite")
st.markdown(
    """
""",
    unsafe_allow_html=True,
)
# 2. Engine loading (cached)
@st.cache_resource
def load_full_engine():
    """Build the analysis engine; ``st.cache_resource`` caches the instance."""
    analyzer = AnalyzerEngine()
    return analyzer
engine = load_full_engine()

# 3. Session state initialisation: "res" holds the latest analysis results
# and starts out empty.
if "res" not in st.session_state:
    st.session_state["res"] = None
def prepare_export_data(res, fig_res=None):
    """Flatten the analysis results into a long-format table for CSV export.

    Args:
        res: Mapping of result sections ("basic", "lexical", "syntax",
            "cohesion", "consecutio", "combined"); missing sections are
            treated as empty.
        fig_res: Optional mapping with the figurative-language metrics
            ("mds_s", "mds_w", "total"). When falsy, those metrics are
            exported as 0.

    Returns:
        A ``pandas.DataFrame`` built from rows with the keys "Categoria",
        "Metrica" and "Valore".
    """
    # Keys holding bulky payloads (sentence texts, CoNLL dumps, the parsed doc).
    skipped_keys = ("texts", "conll", "doc")
    sections = (
        ("Statistiche Base", "basic"),
        ("Lessico", "lexical"),
        ("Sintassi e Verbi", "syntax"),
        ("Coesione", "cohesion"),
        ("Consecutio", "consecutio"),
        ("Combined", "combined"),
    )

    # Scalar metrics of every section, in section order.
    records = [
        {"Categoria": label, "Metrica": metric, "Valore": measured}
        for label, section_key in sections
        for metric, measured in res.get(section_key, {}).items()
        if isinstance(measured, (int, float, str)) and metric not in skipped_keys
    ]

    # Connectors live in a nested dictionary inside the cohesion section.
    connector_counts = res.get("cohesion", {}).get("connectors", {})
    records.extend(
        {"Categoria": "Connettori", "Metrica": f"Conn_{kind}", "Valore": amount}
        for kind, amount in connector_counts.items()
    )

    # Figurative-language (BERT) metrics, defaulting to 0 when not computed.
    bert_source = fig_res if fig_res else {}
    records.extend(
        {
            "Categoria": "Figuratività (BERT)",
            "Metrica": metric,
            "Valore": bert_source.get(metric, 0),
        }
        for metric in ("mds_s", "mds_w", "total")
    )
    return pd.DataFrame(records)
def clear_all():
    """Reset the input field and drop every result tied to the previous text.

    Used as the ``on_click`` callback of the "Clean Fields" button. Besides
    clearing ``res`` and ``input_text`` it removes the cached parsed document,
    the figurative-language results, the combined-metrics results and the
    source name, so that nothing computed for the old text can be displayed
    or exported together with a later analysis.
    """
    st.session_state.res = None
    st.session_state.input_text = ""
    for stale_key in ("nlp_doc", "fig_results_data", "tab6_results", "source_name"):
        st.session_state.pop(stale_key, None)
st.title("📝 Co\u00B3 Suite")
st.caption("Hybrid Text Analysis Dashboard with parameters for Coherence, Cohesion and Complexity evaluation powered by a custom-built engine and BERT.")
lang_choice = st.radio("Language:", ["English", "Italiano"], horizontal=True)
st.session_state.lang = "it" if lang_choice == "Italiano" else "en"

# 4. Input area
uploaded_file = st.file_uploader("Upload a text file (.txt)", type=["txt"])
if uploaded_file is not None:
    # Read the file content. "utf-8-sig" drops a leading BOM and
    # errors="replace" keeps the app alive on files that are not valid UTF-8.
    stringio = uploaded_file.getvalue().decode("utf-8-sig", errors="replace")
    # Mirror the content into the session state so that the text area shows
    # it, truncated to the 30,000-character limit of the widget.
    st.session_state.input_text = stringio[:30000]
    # Strip only the trailing extension, not every ".txt" inside the name.
    st.session_state.source_name = os.path.splitext(uploaded_file.name)[0]
elif st.session_state.get("input_text", "").strip():
    # No file: if the text area already holds some text, label it generically.
    st.session_state.source_name = "CustomText"

txt = st.text_area(
    "Insert Text (max 30,000 characters):",
    height=200,
    max_chars=30000,
    key="input_text",
)
# Full, untruncated input; only used to report the original length.
input_final = st.session_state.input_text if uploaded_file is None else stringio

col_btn1, col_btn2, _ = st.columns([1, 1, 4])
with col_btn1:
    if st.button("Start Analysis", type="primary"):
        if len(input_final) > 30000:
            # NOTE(review): the message reads like a hard failure, but the
            # analysis below still runs on the truncated text held in the
            # session state. Confirm whether this should stop or just warn.
            st.error(f"The text is too long ({len(input_final)} characters). The maximum limit is 30,000.")
        if st.session_state.input_text.strip():
            # Results derived from a previous text must not outlive it.
            st.session_state.pop("tab6_results", None)
            st.session_state.pop("fig_results_data", None)
            with st.spinner("Analyzing..."):
                full_res = engine.run(st.session_state.input_text, lang=st.session_state.lang)
                # Store the textual results
                st.session_state.res = full_res
                # Store the DOC object separately for BERT
                st.session_state.nlp_doc = full_res["doc"]
        else:
            st.warning("Please insert some text.")
with col_btn2:
    st.button("Clean Fields", on_click=clear_all)
# 5. Display of the 34 outputs
if st.session_state.res:
    r = st.session_state.res
    # Short aliases for the result sections used throughout the tabs.
    b, l, s, c, d = r["basic"], r["lexical"], r["syntax"], r["cohesion"], r["consecutio"]
    t1, t2, t3, t4, t5, t6, t7 = st.tabs([
        "📊 Basical Statistics",
        "📚 Lexical features ",
        "🏗️ Sintax & Verbs",
        "🧠 Cohesion & Connectives",
        "🎭 Figurative Language",
        "📈 Combined Metrics",
        "📖 ILA Index"
    ])

    # --- TAB 1: BASIC STATISTICS & RICHNESS ---
    with t1:
        st.subheader("Quantitative Basics")
        c1, c2, c3, c4 = st.columns(4)
        c1.metric("Total Tokens", b["tokens"])
        c2.metric("Total Sentences", b["sentences"])
        c3.metric("Total Paragraphs", b["paragraphs"])
        c4.metric("Hapax Legomena", l.get("hapax", 0), help="Number of words that appear only once in the text.")
        c1.metric("Hapax Ratio", l.get("hapax_ratio", 0))
        c2.metric("Type-Token Ratio (TTR)", l.get("TTR", 0), help="Ratio of unique words (types) to total words (tokens).")
        c3.metric("Gunning Fog Index", l.get("gunning_fog", 0), help="An estimate of the years of formal education needed to understand the text on a first reading. This metric is based on the average sentence length and the percentage of complex words (words with three or more syllables).")
        c4.metric("HD-D Index", l.get("HD-D", 0), help="A measure of lexical diversity that accounts for the frequency of word usage. It is calculated as the ratio of the number of unique words (types) to the total number of words (tokens) raised to the power of 0.5. A higher HD-D index indicates greater lexical diversity, while a lower index suggests more repetition in word usage.")
        st.divider()
        st.subheader("Lexical Range Metrics and Concreteness")
        col_r1, col_r2, col_r3, col_r4 = st.columns(4)
        col_r1.metric("Lexical Range 1", l.get("r1", 0), help="Lexical Range 1: Percentage of words that are among the 1000 most common words in the language according to the dictionaries of the Nation’s Range program. A lower percentage indicates a wider lexical range, while a higher percentage suggests a more limited vocabulary.")
        col_r2.metric("Lexical Range 2", l.get("r2", 0), help="Lexical Range 2: Percentage of words that are among the 2000 most common words in the language according to the dictionaries of the Nation’s Range program. ")
        col_r3.metric("Lexical Range 3", l.get("r3", 0), help="Lexical Range 3: Percentage of words percentage of words belonging to \"The Academic Word List\"")
        col_r4.metric("Concreteness (MRC)", round(l.get("concreteness", 0), 2), help="Average concreteness rating of the words in the text based on the MRC Psycholinguistic Database. Concreteness ratings range from 100 (very abstract) to 700 (very concrete). A higher average concreteness score indicates that the text contains more concrete and tangible words, while a lower score suggests a more abstract vocabulary.")
        st.divider()
        st.subheader("🔗 Bigrams & Trigrams (PMI)")
        # Retrieve the lemmas from the stored doc
        if "nlp_doc" in st.session_state:
            current_doc = st.session_state.nlp_doc
            all_lemmas = [t.lemma_.lower() for t in current_doc if not t.is_punct and not t.is_space]
            c_bi, c_tri = st.columns(2)
            with c_bi:
                df_bi = engine.freq_mod.get_pmi(all_lemmas, n=2)
                st.caption("Top Bigrams")
                st.dataframe(df_bi, use_container_width=True, hide_index=True)
                if not df_bi.empty:
                    st.download_button("📥 Download Bigrams", df_bi.to_csv(index=False).encode('utf-8'), "bigrams.csv", "text/csv", key="dl_bi")
            with c_tri:
                df_tri = engine.freq_mod.get_pmi(all_lemmas, n=3)
                st.caption("Top Trigrams")
                st.dataframe(df_tri, use_container_width=True, hide_index=True)
                if not df_tri.empty:
                    st.download_button("📥 Download Trigrams", df_tri.to_csv(index=False).encode('utf-8'), "trigrams.csv", "text/csv", key="dl_tri")
            st.divider()
            st.subheader("📈 Frequencies & TF-IDF Trends")
            # UI controls for splitting the text
            col_split1, col_split2 = st.columns([1, 3])
            split_mode = col_split1.radio("Split Method:", ["numeric", "regex"])
            if split_mode == "numeric":
                split_val = col_split2.number_input("Number of parts:", min_value=2, max_value=20, value=5)
            else:
                split_val = col_split2.text_input("Regex pattern (e.g. \\n\\n for paragraphs):", value="\n\n")
            # Trends are computed on the fly.
            # NOTE(review): this reads the current text-area content, which may
            # differ from the text that produced current_doc — confirm intent.
            chunks = engine.freq_mod.chunk_doc(st.session_state.input_text, current_doc, mode=split_mode, val=split_val)
            df_freq, df_tfidf = engine.freq_mod.get_trends(chunks)
            if not df_freq.empty:
                # Pre-select the most frequent words
                top_overall = df_freq.groupby("Word")["Count"].sum().nlargest(5).index.tolist()
                selected_words = st.multiselect("Select words to plot:",
                                                options=df_freq["Word"].unique(),
                                                default=top_overall)
                if selected_words:
                    plot_data_freq = df_freq[df_freq["Word"].isin(selected_words)]
                    plot_data_tfidf = df_tfidf[df_tfidf["Word"].isin(selected_words)]
                    # Absolute frequencies chart
                    fig_f = go.Figure()
                    for word in selected_words:
                        w_data = plot_data_freq[plot_data_freq["Word"] == word]
                        fig_f.add_trace(go.Scatter(x=w_data["Part"], y=w_data["Count"], mode='lines+markers', name=word))
                    fig_f.update_layout(title="Absolute Frequencies per Chunk", xaxis_title="Chunk", yaxis_title="Count")
                    # TF-IDF chart
                    fig_t = go.Figure()
                    for word in selected_words:
                        w_data = plot_data_tfidf[plot_data_tfidf["Word"] == word]
                        fig_t.add_trace(go.Scatter(x=w_data["Part"], y=w_data["TF-IDF"], mode='lines+markers', name=word, line=dict(dash='dot')))
                    fig_t.update_layout(title="TF-IDF per Chunk", xaxis_title="Chunk", yaxis_title="TF-IDF Score")
                    st.plotly_chart(fig_f, use_container_width=True)
                    st.plotly_chart(fig_t, use_container_width=True)
                    # Download buttons for the complete data
                    st.divider()
                    col_dl1, col_dl2 = st.columns(2)
                    col_dl1.download_button(
                        "📥 Download Complete Frequencies",
                        df_freq.to_csv(index=False).encode('utf-8'),
                        "frequencies.csv", "text/csv", key="dl_freq"
                    )
                    col_dl2.download_button(
                        "📥 Scarica TF-IDF Completo",
                        df_tfidf.to_csv(index=False).encode('utf-8'),
                        "tfidf.csv", "text/csv", key="dl_tfidf"
                    )
                else:
                    st.info("Select at least one word to display the charts.")

    # --- TAB 2: LEXICON & DEIXIS ---
    with t2:
        st.subheader("Pronouns and Nouns")
        l1, l2, l3, l4 = st.columns(4)
        l1.metric("Pronouns", l.get("pronouns", 0))
        l2.metric("Nouns", l.get("nouns", 0))
        l3.metric("Ratio Pronouns/Nouns", l.get("pron_noun_ratio", 0))
        l4.metric("First Person Pronouns", l.get("first_person_ratio", 0))
        l1.metric("Modifiers per Noun", s.get("mod_per_noun", 0), help="Average number of modifiers (adjectives, relative clauses, etc.) per noun in the text. A higher value indicates a more descriptive and detailed use of nouns, while a lower value suggests a simpler noun usage.")
        st.divider()
        st.subheader("Adjectives and Emphatic particles")
        d1, d2, d3, d4 = st.columns(4)
        d1.metric("Deictics", l.get("deictics", 0))
        d2.metric("Adjectives", s.get("adj_count", 0))
        d3.metric("Adj per Sentence", s.get("adj_x_sent", 0))
        d4.metric("Emphatic Particles", l.get("emphatic_particles", 0))
        d1.metric("Deictic/Articles", l.get("deictic_Frequency", 0))
        d2.metric("attributive/Adj Ratio", s.get("attr_adjs_ratio", 0))
        d3.metric("attributive Adj Frequency", s.get("attr_adjs_freq", 0))
        st.divider()
        st.subheader("Articles")
        j1, j2, j3, j4 = st.columns(4)
        j1.metric("Articles", l.get("articles", 0))
        j2.metric("Definite Articles", l.get("definite_articles", 0))
        j3.metric("Demonstratives articles", l.get("demonstratives", 0))
        j3.metric("Demonstratives per sentence", l.get("demonstratives_ratio", 0))
        j2.metric("Definite articles per sentence", l.get("definite_articles_ratio", 0))

    # --- TAB 3: SYNTAX & VERBS ---
    with t3:
        st.subheader("Syntactic Measures")
        s1, s2, s3, s4 = st.columns(4)
        s1.metric("Average Sentence Length", s.get("avg_sent_len", 0))
        s2.metric("Subordinate per Sentence", s.get("sub_ratio", 0))
        s3.metric("Relatives per Sentence", s.get("rel_clauses_per_sent", 0))
        s4.metric("Distance from Root", s.get("root_dist", 0), help="Average distance from the root of the dependency tree to the other nodes. A higher value indicates a more complex syntactic structure, while a lower value suggests a simpler structure.")
        s1.metric("Hypotactic depth", d.get("avg_depth", 0), help="Average depth of the dependency tree.")
        # The fallback must be numeric: round() raises TypeError on a dict.
        s2.metric("sentence depth variance", round(d.get("sentence_depths", 0), 4), help="Variance of the depths of the dependency trees across sentences. A higher variance indicates greater variability in sentence complexity, while a lower variance suggests more uniformity in sentence structure.")
        s1.metric("Punctuation Pairs per Sentence", s.get("punct_pairs_per_sent", 0), help="Number of punctuation pairs (e.g., parentheses, quotes) per sentence.")
        s2.metric("Subj-Verb_Obj Inversions per sentence", s.get("svo_inversions_per_sent", 0), help="Number of subj-verb-obj inversions per sentence.")
        s3.metric("Number of Subordinate Clauses (completive excluded)", s.get("non_comp_sub_per_sent", 0), help="Number of non-completive subordinate clauses per sentence.")
        st.divider()
        st.subheader("Verbs & Tenses")
        v1, v2, v3, v4 = st.columns(4)
        v1.metric("Ratio Present/Verbs", s.get("present_ratio", 0))
        v2.metric("Ratio Past/Verbs", s.get("past_ratio", 0))
        v3.metric("Ratio Participles/Verbs", s.get("participle_ratio", 0))
        v4.metric("Consecutio Index", d.get("consecutio_index", 0), help="Index measuring the sequential relationship between clauses.")
        v1.metric("Temporal Stability", f"{d.get('tense_stability', 0)}", help="Percentage of verbs that maintain the same tense across the text. A higher percentage indicates greater temporal stability, while a lower percentage suggests more frequent tense shifts.")
        v2.metric("Verbal Density", f"{d.get('verb_density', 0)}", help="Percentage of words that are verbs in the text. A higher percentage indicates a more verb-heavy text, while a lower percentage suggests a less verb-heavy text.")
        st.divider()
        st.subheader("Dependency Tree Inspection (CoNLL-U)")
        # Sentence selector used to display the matching CoNLL-U block
        idx = st.selectbox(
            "Select a sentence to inspect:",
            range(len(b["texts"])),
            format_func=lambda i: f"Frase {i+1}: {b['texts'][i][:70]}..."
        )
        # Text area showing the content generated by the engine
        st.text_area(
            label="CoNLL-U Format (Tab-Separated):",
            value=b["conll"][idx],
            height=300
        )

    # --- TAB 4: COHESION & CONNECTIVES ---
    with t4:
        st.subheader("Textual Cohesion Metrics")
        m1, m2, m3, m4 = st.columns(4)
        m1.metric("Lemma Overlap (adjacent)", f"{c.get('lexical_cohesion_local', 0)*100:.2f}%", help="Percentage of lemmas that are shared between adjacent paragraphs. A higher percentage indicates stronger local cohesion, while a lower percentage suggests weaker local cohesion.")
        m2.metric("Lemma Overlap (3 paragraphs)", f"{c.get('lexical_cohesion_global', 0)*100:.2f}%", help="Percentage of lemmas that are shared between sentences in different paragraphs. ")
        m3.metric("Semantic Overlap (Sentences)", f"{c.get('semantic_cohesion_sentences', 0)*100:.2f}%", help="Percentage of semantic relationships that are shared between adjacent sentences. This metric is calculated by using BERT to identify similarities in meaning between sentences. ")
        m4.metric("Semantic Overlap (Paragraphs)", f"{c.get('semantic_cohesion_paragraphs', 0)*100:.2f}%", help="Percentage of semantic relationships that are shared between sentences in different paragraphs. ")
        st.divider()
        st.subheader("Connector Frequency (Normalized)")
        # Mapping used to display all 8 connector types
        conn_data = c.get("connectors", {})
        col_c1, col_c2 = st.columns(2)
        with col_c1:
            st.bar_chart(pd.Series({
                "Adictives +": conn_data.get("AdPos", 0),
                "Adictives -": conn_data.get("AdNeg", 0),
                "Causals +": conn_data.get("CausPos", 0),
                "Causals -": conn_data.get("CausNeg", 0)
            }))
        with col_c2:
            st.bar_chart(pd.Series({
                "Temporals +": conn_data.get("TempPos", 0),
                "Temporals -": conn_data.get("TempNeg", 0),
                "Logics +": conn_data.get("LogPos", 0),
                "Logics -": conn_data.get("LogNeg", 0)
            }))
        st.info(f"Cohesion Value\n\n{c.get('general_cohesion', 0)}\n\nThe logarithm of the standard deviation, using the weighted sum of frequencies as the base.")

    # --- TAB 5: FIGURATIVE LANGUAGE ANALYSIS ---
    with t5:
        st.header("🎭 Figurative Language Analysis (BERT V5)", help="The analysis may take some time to complete. Large texts may require several minutes.")
        if "nlp_doc" not in st.session_state:
            st.warning("Before proceeding, run the general analysis in Tab 1.")
        else:
            # Three columns: one for the button, one for the checkbox, one for the slider
            col_btn, col_chk, col_sld = st.columns([1.5, 1, 2], gap="medium")
            with col_chk:
                use_sampling = st.checkbox("Sample mode", value=True, help="Analyze only a random portion of the text.")
            with col_sld:
                sample_rate = 1.0
                if use_sampling:
                    sample_rate = st.slider("Sample Size %", 5, 95, 10, 5) / 100
            with col_btn:
                # The button is aligned with the other widgets
                launch = st.button("Launch Metaphor Detector", type="secondary", use_container_width=True)
            if launch:
                with st.spinner("BERT is executing the Masked Language Modeling..."):
                    lang = st.session_state.get("lang", "en")
                    fig_res = engine.fig_mod.analyze(st.session_state.nlp_doc,
                                                     sample_rate=sample_rate,
                                                     lang=lang)
                    st.session_state.fig_results_data = fig_res
                    # Remember the rate actually used: the slider may be moved
                    # later without re-running the detector.
                    st.session_state.fig_sample_rate = sample_rate
            if "fig_results_data" in st.session_state:
                fig_res = st.session_state.fig_results_data
                # The header and the MDS-S metric were previously rendered twice.
                st.subheader("MDS indices (Metaphor Density Score)")
                c1, c2, c3 = st.columns(3)
                c1.metric("MDS-S (per Sentence)", round(fig_res["mds_s"], 4), help="Metaphor Density Score per Sentence: This metric calculates the average number of metaphors per sentence in the text. Metaphor detection scans syntactic pairs (Subj-Verb, Obj-Verb, Noun-Adj) by masking terms and using BERT to predict contextual expectations; it flags a metaphor when the semantic similarity between the original word (neutralized via person/thing placeholders) and BERT's top candidates falls below a 0.90 threshold.")
                c2.metric("MDS-W (per 1k Words)", round(fig_res["mds_w"], 2), help="This metric calculates the average number of metaphors per 1,000 words in the text.")
                c3.metric("Total Metaphors", fig_res["total"], help="Total number of metaphors detected in the text.")
                if fig_res.get("is_sample"):
                    used_rate = st.session_state.get("fig_sample_rate", sample_rate)
                    st.caption(f"⚠️ Note: These scores are estimated based on a {used_rate*100:.0f}% random sample of the text.")
                st.divider()
                st.subheader("🔍 Retrieved semantic anomalies")
                if not fig_res["detections"]:
                    st.info("No semantic anomalies detected with the current thresholds.")
                else:
                    # "det" rather than "d": "d" is the consecutio results dict.
                    for det in fig_res["detections"]:
                        # Colour of the box depends on the probability
                        color = "red" if det['probability'] > 85 else "orange" if det['probability'] > 70 else "blue"
                        with st.expander(f":{color}[{det['term']} ↔ {det['head']}] - Probabilità: {det['probability']}%"):
                            st.write(f"**Context:** _{det['sentence']}_")
                            st.caption(f"Logic: {det['reason']} | s1: {det['s1']} | s2: {det['s2']}")

    # --- TAB 6: COMBINED METRICS ---
    with t6:
        st.subheader("📈 Combined Metrics & Positioning")
        if st.session_state.get("lang", "en") == "it":
            st.warning(
                "⚠️ **Combined Metrics not available for Italian Language.**\n\n"
                "The reference database is in English. "
                "The Italian module is under developement."
            )
        else:
            if st.session_state.res:
                fig_results = st.session_state.get("fig_results_data", None)
                if not fig_results:
                    st.info("💡 **Caution:** the figurative metrics (BERT) have not been calculated. The model will automatically exclude them from comparison with historical texts.")
                # Launch button
                if st.button("Calculate Positioning and Quality Score", type="primary"):
                    with st.spinner("Clustering and Correlation Calculation in Progress..."):
                        try:
                            source_name = st.session_state.get("source_name", "CustomText")
                            # The engine call returns three values
                            q_score, fig_clustering, classe_assegnata = engine.run_combined_analysis(
                                res=st.session_state.res,
                                fig_res=fig_results,
                                source_name=source_name,
                            )
                            # Keep everything in the session state
                            st.session_state.tab6_results = (q_score, fig_clustering, classe_assegnata)
                        except FileNotFoundError:
                            st.warning("Historical database ('database_completo_largo.pkl') not found.")
                        except Exception as e:
                            st.error(f"Error occurred during combined analysis: {e}")
                # Show the results
                if "tab6_results" in st.session_state:
                    q_score, fig_clustering, classe_assegnata = st.session_state.tab6_results
                    # Both the numeric score and its verbal rating are displayed
                    col_score, col_desc = st.columns([1, 2])
                    with col_score:
                        st.metric("Quality Score", f"{q_score:.3f}")
                    with col_desc:
                        # The "Fair" band used to start at -15, which made the
                        # "Poor" branch unreachable for the documented 0-4
                        # range; it now starts at the 0.5 cut-off that the
                        # original last branch tested against.
                        if q_score >= 3:
                            st.success("🌟 Excellent!")
                        elif q_score >= 2:
                            st.info("📊 Good: Balanced style, tending towards quality works.")
                        elif q_score >= 0.5:
                            st.warning("⚠️ Fair: Common traits with consumer or debut literature.")
                        else:
                            st.error("📉 Poor: Stylistically close to amateur or basic works.")
                        with st.expander("ℹ️ How is this score interpreted?"):
                            st.markdown("""
The score is calculated by measuring the stylistic correlation (Pearson) of your text against 4 predefined classes of works, penalizing similarity to amateur texts.
* **theoric range**: from **0** to **4**
* **Towards 3.0**: Maximum affinity with Great Classics/Masterpieces and Great Bestsellers.
* **Around 2.0**: Neutral or hybrid style.
* **Towards 0.0**: Maximum affinity with Amateur Works and Basic Genre Literature.
""")
                    # The predicted class uses an info box, which wraps long text
                    st.info(f"🏷️ **Predicted Class:**\n\n{classe_assegnata}")
                    st.divider()
                    st.plotly_chart(fig_clustering, use_container_width=True)
            else:
                st.info("Perform the basic analysis in Tab 1 first.")

    # --- TAB 7: ILA INDEX (readability) ---
    with t7:
        st.subheader("📖 Index of Automatic Readability (ILA)")
        # 1. Data retrieval
        lr1 = l.get("r1", 0)
        lr2 = l.get("r2", 0)
        lr3 = l.get("r3", 0)
        punct = s.get("punct_pairs_per_sent", 0)
        svo = s.get("svo_inversions_per_sent", 0)
        non_comp = s.get("non_comp_sub_per_sent", 0)
        # 2. Lexical component. The guard wraps the whole logarithm:
        # math.log(0.0) raises ValueError, so the fallback cannot be its argument.
        somma_lr = lr1 + lr2 + lr3
        if somma_lr > 0:
            comp_lessicale = math.log(((lr1 * 2) + lr2 + lr3) / somma_lr)
        else:
            comp_lessicale = 0.0
        # 3. Syntactic penalty
        comp_sintattica = (punct + svo + non_comp)
        # 4. Final ILA
        ila_score = comp_lessicale - comp_sintattica
        # 5. Base metrics
        col_il1, col_il2, col_il3 = st.columns(3)
        col_il1.metric("Lexical Readability", f"{comp_lessicale:.4f}", help="(Log((LR1 * 2) + LR2 + LR3) / Somma LR)")
        col_il2.metric("Sytactic Complexity", f"- {comp_sintattica:.4f}", help="Sum of: Punctuation Pairs per Sentence + SVO Inversions per Sentence + Non-completive Subordinate Clauses per Sentence")
        col_il3.metric("Final ILA Score", f"{ila_score:.4f}")
        st.info("💡 ** How to Interpret the ILA Index:** A higher value indicates greater readability and fluency. The score rewards the use of high-frequency vocabulary (LR1) and penalizes syntactic complexity (inversions, parentheses, and complex subordinate clauses).")
        st.divider()

        # --- 6. ILA COMPARISON CHART ---
        st.subheader("📊 Comparison of ILA with Reference Works")
        # Reference data. The labels were split across physical lines (a syntax
        # error); Plotly renders "<br>" as a line break inside text.
        books = [
            "Ulysses (Part III)<br>(Hard)",
            "The Hobbit<br>(Med. Hard)",
            "The Little Prince<br>(Med. Easy)",
            "Isodora Moon goes to School<br>(Easy)",
            "📍 Your Text",
        ]
        lex_vals = [0.63, 0.64, 0.65, 0.64, comp_lessicale]
        syn_vals = [-2.73, -1.29, -0.87, -0.53, -comp_sintattica]  # Negative so that they are drawn below zero
        ila_vals = [-2.1, -0.65, -0.23, 0.11, ila_score]
        fig_ila = go.Figure()
        # Bars for the two components
        fig_ila.add_trace(go.Bar(name='Lexical Readability (+)', x=books, y=lex_vals, marker_color='#00cc96'))
        fig_ila.add_trace(go.Bar(name='Sytactic Complexity (-)', x=books, y=syn_vals, marker_color='#ff4b4b'))
        # Line for the final score
        fig_ila.add_trace(go.Scatter(
            name='ILA Score',
            x=books, y=ila_vals,
            mode='lines+markers+text',
            text=[f"{v:.2f}" for v in ila_vals],
            textposition="top center",
            textfont=dict(color="black", size=12),
            marker=dict(color='white', size=10, line=dict(color='black', width=2)),
            line=dict(color='black', width=2)
        ))
        # Chart formatting
        fig_ila.update_layout(
            barmode='relative',  # Stacks positives above and negatives below zero
            height=400,
            margin=dict(l=20, r=20, t=40, b=20),
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
        )
        st.plotly_chart(fig_ila, use_container_width=True)
        st.divider()

        # --- OTHER INDICES ---
        st.subheader("📊 Comparison of Standard Indices")
        # HD-D and Gunning Fog are already computed by the engine
        hdd = l.get("HD-D", 0)
        gunning_fog = l.get("gunning_fog", 0)
        # Gulpease (letters approximated as characters - tokens to exclude spaces)
        tokens_count = b.get("tokens", 1)
        sents_count = b.get("sentences", 1)
        chars_count = b.get("chars", 1)
        letters_count = chars_count - tokens_count
        gulpease = 89 + ((300 * sents_count) - (10 * letters_count)) / tokens_count if tokens_count > 0 else 0
        gulpease = max(0, min(100, gulpease))  # Clamp the value between 0 and 100

        # Gauge charts built with Plotly
        # 1. GULPEASE (higher means easier)
        fig_gulp = go.Figure(go.Indicator(
            mode="gauge+number",
            value=gulpease,
            title={'text': "Gulpease<br>Higher = Easier"},
            gauge={
                'axis': {'range': [0, 100]},
                'bar': {'color': "rgba(0,0,0,0.5)"},
                'steps': [
                    {'range': [0, 40], 'color': "#ff4b4b"},    # Hard (red)
                    {'range': [40, 60], 'color': "#ffa500"},   # Medium (orange)
                    {'range': [60, 100], 'color': "#00cc96"}   # Easy (green)
                ]}
        ))
        # 2. GUNNING FOG (higher means harder)
        fig_fog = go.Figure(go.Indicator(
            mode="gauge+number",
            value=gunning_fog,
            title={'text': "Gunning Fog<br>Higher = Harder"},
            gauge={
                'axis': {'range': [0, 25]},
                'bar': {'color': "rgba(0,0,0,0.5)"},
                'steps': [
                    {'range': [0, 9], 'color': "#00cc96"},     # Easy (green)
                    {'range': [9, 14], 'color': "#ffa500"},    # Medium (orange)
                    {'range': [14, 25], 'color': "#ff4b4b"}    # Hard (red)
                ]}
        ))
        # 3. HD-D (higher means richer vocabulary)
        fig_hdd = go.Figure(go.Indicator(
            mode="gauge+number",
            value=hdd,
            title={'text': "Diversity (HD-D)<br>Higher = More Diverse"},
            gauge={
                'axis': {'range': [0, 50]},  # Typical range for HD-D
                'bar': {'color': "rgba(0,0,0,0.5)"},
                'steps': [
                    {'range': [0, 30], 'color': "#ff4b4b"},    # Repetitive (red)
                    {'range': [30, 40], 'color': "#ffa500"},   # Normal (orange)
                    {'range': [40, 50], 'color': "#00cc96"}    # Rich (green)
                ]}
        ))
        # Shrink the margins so that the gauges fit side by side
        for fig in [fig_gulp, fig_fog, fig_hdd]:
            fig.update_layout(height=260, margin=dict(l=20, r=20, t=90, b=20))
        # Show the three gauges in three columns
        cg1, cg2, cg3 = st.columns(3)
        with cg1:
            st.plotly_chart(fig_gulp, use_container_width=True)
        with cg2:
            st.plotly_chart(fig_fog, use_container_width=True)
        with cg3:
            st.plotly_chart(fig_hdd, use_container_width=True)
        st.divider()
if st.session_state.res:
    # BERT data, when available
    fig_results = st.session_state.get("fig_results_data", None)
    # Build the export DataFrame
    df_export = prepare_export_data(st.session_state.res, fig_results)
    base_name = st.session_state.get("source_name", "CustomText")
    # "datetime" is the standard-library module here: the later
    # "import datetime" shadows the earlier import from tomlkit.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    csv_name = f"Co3_{base_name}_{timestamp}.csv"
    csv_data = df_export.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="📥 Export Results in CSV",
        data=csv_data,
        file_name=csv_name,
        mime='text/csv'
    )

# Footer with the visit counter. The f-prefix is required: without it the
# literal text "{visite}" is displayed instead of the value.
visite = st.session_state.get("total_visits", 0)
st.caption(f"Co\u00B3 Suite | {visite}")