"""FastAPI service exposing a reading-study API and an attention-saliency API.

The study endpoints serve fixed texts and persist results in SQLite; the
saliency endpoints run a causal language model with attention outputs enabled
and turn the attention each token receives into a per-token score.
"""

import os
import re
import sqlite3
import sys
from contextlib import closing
from typing import List, Optional

import numpy as np
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import RedirectResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.mount("/static", StaticFiles(directory="static"), name="static")


@app.get("/")
def read_root():
    """Redirect the site root to the static front-end page."""
    return RedirectResponse(url="/static/index.html")


# --- SQLite Database Setup ---
# Hugging Face Spaces make the /app directory read-only by default.
# We must write the database to /data (if persistent storage is enabled) or /tmp.
DB_DIR = "/data" if os.path.exists("/data") else "/tmp"
DB_FILE = os.path.join(DB_DIR, "study.db")

# Conditions reported by /api/study/stats, in response order.
STUDY_CONDITIONS = ("plain", "flowread", "gradient")


def init_db():
    """Create the study tables in DB_FILE if they do not exist yet."""
    # closing() guarantees the connection is released even if a statement
    # raises; `with conn` commits on success and rolls back on error.
    with closing(sqlite3.connect(DB_FILE)) as conn:
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS study_results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT,
                    text_id INTEGER,
                    condition TEXT, -- "plain" or "flowread" or "gradient"
                    reading_time_ms INTEGER,
                    score INTEGER,
                    total_questions INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS study_preferences (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT,
                    preference TEXT, -- "plain", "flowread", or "gradient"
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)


init_db()

# --- Study Content ---
# NOTE(review): as visible here, "flowread_html" and "flowread_gradient_html"
# are character-for-character identical to "text" — confirm the intended
# markup is actually present in these fields.
STUDY_TEXTS = [
    {
        "id": 1,
        "topic": "Science",
        "text": "The human brain is a marvel of biological engineering, containing approximately 86 billion neurons interconnected by trillions of synapses. These neural networks are responsible for everything from basic autonomic functions, like breathing and heart rate, to complex cognitive processes such as memory, emotion, and problem-solving. Neuroplasticity, the brain's ability to reorganize itself by forming new neural connections throughout life, allows humans to learn new skills, recover from injuries, and adapt to changing environments. This extraordinary adaptability is what makes our species so resilient and capable of continuous intellectual growth.",
        "questions": [
            {
                "question": "Approximately how many neurons are in the human brain?",
                "options": ["86 million", "86 billion", "100 trillion", "50 billion"],
                "correct": 1,
            },
            {
                "question": "What is the term for the brain's ability to reorganize itself?",
                "options": [
                    "Synaptic generation",
                    "Neurogenesis",
                    "Neuroplasticity",
                    "Cognitive adaptation",
                ],
                "correct": 2,
            },
        ],
        "flowread_html": 'The human brain is a marvel of biological engineering, containing approximately 86 billion neurons interconnected by trillions of synapses. These neural networks are responsible for everything from basic autonomic functions, like breathing and heart rate, to complex cognitive processes such as memory, emotion, and problem-solving. Neuroplasticity, the brain\'s ability to reorganize itself by forming new neural connections throughout life, allows humans to learn new skills, recover from injuries, and adapt to changing environments. This extraordinary adaptability is what makes our species so resilient and capable of continuous intellectual growth.',
        "flowread_gradient_html": 'The human brain is a marvel of biological engineering, containing approximately 86 billion neurons interconnected by trillions of synapses. These neural networks are responsible for everything from basic autonomic functions, like breathing and heart rate, to complex cognitive processes such as memory, emotion, and problem-solving. Neuroplasticity, the brain\'s ability to reorganize itself by forming new neural connections throughout life, allows humans to learn new skills, recover from injuries, and adapt to changing environments. This extraordinary adaptability is what makes our species so resilient and capable of continuous intellectual growth.',
    },
    {
        "id": 2,
        "topic": "History",
        "text": "The Industrial Revolution, which began in Britain in the late 18th century, marked a profound turning point in human history. It initiated the transition from agrarian, handicraft economies to industry and machine manufacturing. The invention of the steam engine, pioneered by figures like James Watt, dramatically increased the efficiency of factories and transportation, revolutionizing the textile industry and leading to the expansion of railways. This era brought about unprecedented economic growth and urbanization, fundamentally altering social structures and paving the way for the modern capitalist system, despite also causing significant social inequalities and poor working conditions initially.",
        "questions": [
            {
                "question": "Where did the Industrial Revolution begin?",
                "options": ["United States", "France", "Germany", "Britain"],
                "correct": 3,
            },
            {
                "question": "Which invention dramatically increased factory efficiency?",
                "options": [
                    "The cotton gin",
                    "The telegraph",
                    "The steam engine",
                    "The assembly line",
                ],
                "correct": 2,
            },
        ],
        "flowread_html": 'The Industrial Revolution, which began in Britain in the late 18th century, marked a profound turning point in human history. It initiated the transition from agrarian, handicraft economies to industry and machine manufacturing. The invention of the steam engine, pioneered by figures like James Watt, dramatically increased the efficiency of factories and transportation, revolutionizing the textile industry and leading to the expansion of railways. This era brought about unprecedented economic growth and urbanization, fundamentally altering social structures and paving the way for the modern capitalist system, despite also causing significant social inequalities and poor working conditions initially.',
        "flowread_gradient_html": 'The Industrial Revolution, which began in Britain in the late 18th century, marked a profound turning point in human history. It initiated the transition from agrarian, handicraft economies to industry and machine manufacturing. The invention of the steam engine, pioneered by figures like James Watt, dramatically increased the efficiency of factories and transportation, revolutionizing the textile industry and leading to the expansion of railways. This era brought about unprecedented economic growth and urbanization, fundamentally altering social structures and paving the way for the modern capitalist system, despite also causing significant social inequalities and poor working conditions initially.',
    },
    {
        "id": 3,
        "topic": "Technology",
        "text": "The James Webb Space Telescope is the largest and most powerful space telescope ever built. Launched in 2021, it operates primarily in the infrared spectrum, allowing it to peer through dense cosmic dust and observe the universe's most distant, early galaxies. Unlike its predecessor, Hubble, JWST orbits the Sun at the second Lagrange point, keeping it constantly shielded from the Sun's heat and light by its massive sunshield. This incredibly cold environment is necessary to prevent the telescope's own infrared emissions from interfering with its highly sensitive observations of exoplanet atmospheres and star formation.",
        "questions": [
            {
                "question": "What spectrum does the James Webb Space Telescope primarily operate in?",
                "options": ["Ultraviolet", "X-ray", "Infrared", "Visible light"],
                "correct": 2,
            },
            {
                "question": "Where does the telescope orbit to stay cold?",
                "options": [
                    "Low Earth Orbit",
                    "The Moon's orbit",
                    "The first Lagrange point",
                    "The second Lagrange point",
                ],
                "correct": 3,
            },
        ],
        "flowread_html": 'The James Webb Space Telescope is the largest and most powerful space telescope ever built. Launched in 2021, it operates primarily in the infrared spectrum, allowing it to peer through dense cosmic dust and observe the universe\'s most distant, early galaxies. Unlike its predecessor, Hubble, JWST orbits the Sun at the second Lagrange point, keeping it constantly shielded from the Sun\'s heat and light by its massive sunshield. This incredibly cold environment is necessary to prevent the telescope\'s own infrared emissions from interfering with its highly sensitive observations of exoplanet atmospheres and star formation.',
        "flowread_gradient_html": 'The James Webb Space Telescope is the largest and most powerful space telescope ever built. Launched in 2021, it operates primarily in the infrared spectrum, allowing it to peer through dense cosmic dust and observe the universe\'s most distant, early galaxies. Unlike its predecessor, Hubble, JWST orbits the Sun at the second Lagrange point, keeping it constantly shielded from the Sun\'s heat and light by its massive sunshield. This incredibly cold environment is necessary to prevent the telescope\'s own infrared emissions from interfering with its highly sensitive observations of exoplanet atmospheres and star formation.',
    },
]


# --- Study API Endpoints ---
@app.get("/api/study/texts")
def get_study_texts():
    """Return every study text together with its questions."""
    return {"texts": STUDY_TEXTS}


class StudySubmission(BaseModel):
    """Request body for one completed reading trial."""

    user_id: str
    text_id: int
    condition: str
    reading_time_ms: int
    score: int
    total_questions: int


class StudyPreference(BaseModel):
    """Request body for a participant's preferred rendering."""

    user_id: str
    preference: str


@app.post("/api/study/preference")
def submit_study_preference(submission: StudyPreference):
    """Insert one row into study_preferences and report success."""
    with closing(sqlite3.connect(DB_FILE)) as conn:
        with conn:  # commit on success, roll back on error
            conn.execute(
                "INSERT INTO study_preferences (user_id, preference) VALUES (?, ?)",
                (submission.user_id, submission.preference),
            )
    return {"status": "success"}


@app.post("/api/study/submit")
def submit_study_result(submission: StudySubmission):
    """Insert one row into study_results and report success."""
    with closing(sqlite3.connect(DB_FILE)) as conn:
        with conn:  # commit on success, roll back on error
            conn.execute(
                "INSERT INTO study_results (user_id, text_id, condition, reading_time_ms, score, total_questions) VALUES (?, ?, ?, ?, ?, ?)",
                (
                    submission.user_id,
                    submission.text_id,
                    submission.condition,
                    submission.reading_time_ms,
                    submission.score,
                    submission.total_questions,
                ),
            )
    return {"status": "success"}


@app.get("/api/study/stats")
def get_study_stats():
    """Return per-condition averages and the preference tally.

    For each condition in STUDY_CONDITIONS the response holds the average
    reading time, the average accuracy in percent and the row count; averages
    fall back to 0 when SQLite returns NULL (no rows). "preferences" maps each
    stored preference value to its count.
    """
    stats = {}
    with closing(sqlite3.connect(DB_FILE)) as conn:
        cur = conn.cursor()
        for condition in STUDY_CONDITIONS:
            cur.execute(
                "SELECT AVG(reading_time_ms), AVG(CAST(score AS FLOAT) / total_questions) * 100, COUNT(*) FROM study_results WHERE condition = ?",
                (condition,),
            )
            avg_time, avg_accuracy, sample_size = cur.fetchone()
            stats[condition] = {
                "avg_reading_time_ms": avg_time or 0,
                "avg_accuracy_percent": avg_accuracy or 0,
                "sample_size": sample_size,
            }

        cur.execute(
            "SELECT preference, COUNT(*) FROM study_preferences GROUP BY preference"
        )
        stats["preferences"] = dict(cur.fetchall())
    return stats


class StderrProgressInterceptor:
    """Pass-through wrapper around stderr that scrapes progress percentages.

    Everything written is forwarded to the wrapped stream. While
    ``active_model`` is set, text matching ``<digits>%|`` updates
    ``model_status`` for that model.
    """

    _PERCENT_RE = re.compile(r"(\d+)%\|")

    def __init__(self, original):
        self.original = original
        self.current_progress = ""
        self.active_model = None

    def write(self, s):
        written = self.original.write(s)
        match = self._PERCENT_RE.search(s)
        if match and self.active_model:
            self.current_progress = f"{match.group(1)}%"
            # Update the global status explicitly so the API returns it immediately
            model_status[self.active_model] = f"downloading: {self.current_progress}"
        return written

    def flush(self):
        self.original.flush()

    def __getattr__(self, name):
        # Only reached for attributes not defined here (isatty, fileno,
        # encoding, ...): delegate so code probing sys.stderr keeps working.
        return getattr(self.original, name)


stderr_interceptor = StderrProgressInterceptor(sys.stderr)
sys.stderr = stderr_interceptor

# --- Saliency API (Existing) ---
DEFAULT_MODEL = "2b"

models = {}
tokenizers = {}
model_status = {"2b": "unloaded", "27b-4a": "unloaded"}
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

hf_token = os.environ.get("HF_TOKEN")


def load_model(model_name: str):
    """Return ``(model, tokenizer)`` for *model_name*, loading it on first use.

    Names that are not keys of ``model_status`` fall back to the default 2b
    model and share its cache entry, so an arbitrary name can neither load a
    second copy of the model nor add entries to ``model_status``.

    Raises:
        Exception: whatever the underlying loader raised; the model's status
            is set to "error" before re-raising.
    """
    if model_name not in model_status:
        model_name = DEFAULT_MODEL

    if model_name in models:
        return models[model_name], tokenizers[model_name]

    print(f"Loading {model_name} on {device}...")
    model_status[model_name] = "downloading: 0%"
    stderr_interceptor.active_model = model_name

    try:
        if model_name == "27b-4a":
            # Use Gemma 4 26B A4B in 4-bit (requires CUDA)
            hf_model_id = "unsloth/gemma-4-26B-A4B-bnb-4bit"
            try:
                tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=hf_token)
            except Exception:
                # Fall back to the non-prequantized repo id.
                hf_model_id = "unsloth/gemma-4-26B-A4B"
                tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=hf_token)

            # BitsAndBytes for 4-bit quantization
            from transformers import BitsAndBytesConfig

            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
            )
            model = AutoModelForCausalLM.from_pretrained(
                hf_model_id,
                quantization_config=bnb_config,
                device_map="auto",
                attn_implementation="eager",
                token=hf_token,
            )
        else:
            # Default to 2b (Gemma 4 E2B)
            hf_model_id = "unsloth/gemma-4-E2B"
            tokenizer = AutoTokenizer.from_pretrained(
                hf_model_id, token=hf_token, extra_special_tokens={}
            )
            model = AutoModelForCausalLM.from_pretrained(
                hf_model_id,
                torch_dtype=torch.bfloat16,
                attn_implementation="eager",
                token=hf_token,
            ).to(device)
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        model_status[model_name] = "error"
        raise
    finally:
        # Stop attributing stderr progress to this model on every exit path.
        stderr_interceptor.active_model = None

    print(f"Model {model_name} loaded successfully.")
    models[model_name] = model
    tokenizers[model_name] = tokenizer
    model_status[model_name] = "loaded"
    return model, tokenizer


# Pre-load default 2b
try:
    load_model(DEFAULT_MODEL)
except Exception:
    # Best effort: the model is loaded lazily on the first request instead.
    print("Could not preload 2b model.")


@app.get("/status")
def get_model_status():
    """Return the current load status of every known model."""
    return model_status


class TextRequest(BaseModel):
    """Request body for the saliency endpoints."""

    text: str
    layers: Optional[List[int]] = None  # Optional explicit layer selection
    layer_preset: str = "middle"  # "first", "middle", "last", "all"
    preprompt: str = ""  # Optional task-driven intent
    saliency_mode: str = "local"  # "local" or "global"


def _select_layers(requested: Optional[List[int]], preset: str, num_layers: int) -> List[int]:
    """Resolve the attention layers to average over.

    An explicit, non-empty *requested* list wins; otherwise *preset* picks the
    first quarter, last quarter, all layers, or (default) the middle half.
    Out-of-range indices are dropped; an empty result falls back to the last
    layer.
    """
    layers = requested
    if not layers:
        quarter = num_layers // 4
        if preset == "all":
            layers = list(range(num_layers))
        elif preset == "first":
            layers = list(range(0, max(1, quarter)))
        elif preset == "last":
            layers = list(range(num_layers - quarter, num_layers))
        else:  # default to "middle"
            layers = list(range(quarter, num_layers - quarter))

    layers = [layer for layer in layers if 0 <= layer < num_layers]
    return layers or [num_layers - 1]


def _normalize_scores(importance, num_preprompt_tokens: int, saliency_mode: str):
    """Map raw per-token importance to scores in [0, 1].

    *importance* covers the whole sequence (prefix tokens included); local
    mode takes its min/max from the tokens after the first
    *num_preprompt_tokens* only.
    """
    if len(importance) <= num_preprompt_tokens:
        return np.ones(len(importance))

    if saliency_mode == "global":
        # Global Mode: absolute importance across different texts
        # We apply a soft root penalty so very high values don't entirely blow out the scale,
        # but high-density blocks will still look visibly darker/bolder than simple blocks.
        # An importance sum of 1.0 (average) maps to ~0.46, an importance of 3.0+ maps to 1.0
        return np.clip((importance / 3.0) ** 0.7, 0, 1.0)

    # Local Mode: relative importance within this specific block of text
    text_importance = importance[num_preprompt_tokens:]
    min_score = text_importance.min()
    max_score = text_importance.max()
    if max_score > min_score:
        scores = (importance - min_score) / (max_score - min_score)
    else:
        scores = importance - min_score

    # Keep the first token at max score.
    # NOTE(review): index 0 is pinned unconditionally, even when the first
    # token is not BOS — confirm that is intended.
    scores[0] = 1.0
    return scores.clip(0, 1)


@app.post("/analyze")
def analyze_text_legacy(request: TextRequest):
    """Legacy route: analyze with the default 2b model."""
    return analyze_text_model(DEFAULT_MODEL, request)


@app.post("/analyze/{model_name}")
def analyze_text_model(model_name: str, request: TextRequest):
    """Score each token of ``request.text`` by the attention it receives.

    Returns ``{"words": [...]}`` where each entry has the cleaned raw token,
    its decoded form and a score in [0, 1]. Tokens belonging to the preprompt
    are removed from the result; a leading BOS token is kept.

    Raises:
        HTTPException: status 500 if the model cannot be loaded.
    """
    text = request.text
    preprompt = request.preprompt.strip()

    if not text.strip():
        # "words" matches every other response of this endpoint; the legacy
        # "tokens"/"scores" keys are kept for existing clients.
        return {"words": [], "tokens": [], "scores": []}

    try:
        model, tokenizer = load_model(model_name)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Combine preprompt and text if preprompt exists
    full_text = f"{preprompt}\n\n{text}" if preprompt else text
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)

    # Calculate how many tokens belong to the preprompt so we can strip them later
    num_preprompt_tokens = 0
    if preprompt:
        p_toks = tokenizer(f"{preprompt}\n\n")["input_ids"]
        num_preprompt_tokens = len(p_toks)
    elif (
        tokenizer.bos_token_id is not None
        and len(inputs["input_ids"][0]) > 0
        and inputs["input_ids"][0][0] == tokenizer.bos_token_id
    ):
        num_preprompt_tokens = 1  # Just the bos token

    with torch.no_grad():
        # Ensure we ask the model to output attentions explicitly
        outputs = model(**inputs, output_attentions=True)

    # Check if attentions are actually returned
    if not outputs.attentions:
        print("Warning: Model did not return attentions.")
        return {"words": []}

    selected_layers = _select_layers(
        request.layers, request.layer_preset, len(outputs.attentions)
    )

    # Average over the selected layers (dim 0) and dim 2 (presumably heads —
    # confirm against the model's attention layout), then take batch item 0.
    stacked_attentions = torch.stack(
        [outputs.attentions[layer] for layer in selected_layers]
    )
    avg_attention = stacked_attentions.mean(dim=(0, 2))[0]

    # Calculate importance: sum of attention each token *receives* from the sequence
    importance = avg_attention.sum(dim=0).cpu().float().numpy()
    normalized_scores = _normalize_scores(
        importance, num_preprompt_tokens, request.saliency_mode
    )

    input_ids = inputs["input_ids"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Check if first token is BOS
    has_bos = (input_ids[0] == tokenizer.bos_token_id) if len(input_ids) > 0 else False

    result = []
    for token_id, raw_token, score in zip(input_ids, tokens, normalized_scores):
        # Decode properly
        word = tokenizer.decode([token_id])

        # Byte fallback handling: raw byte tokens render badly, so both the
        # decoded word and the raw token are blanked.
        if raw_token.startswith("<0x") and raw_token.endswith(">"):
            word = ""
            raw_token = ""

        # Special check for Gemma, decoding often removes spaces incorrectly or leaves tokens empty
        # Let's clean the raw token just in case
        raw_clean = raw_token.replace("\u2581", " ")

        # We will pass both decoded word and raw cleaned token to frontend to help render
        result.append({"token": raw_clean, "word": word, "score": float(score)})

    # Return the actual text tokens. Keep the BOS token if it exists.
    if num_preprompt_tokens > 0 and len(result) > num_preprompt_tokens:
        if has_bos:
            result = [result[0]] + result[num_preprompt_tokens:]
        else:
            result = result[num_preprompt_tokens:]

    return {"words": result}


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("main:app", host="0.0.0.0", port=port, reload=True)