amiguel committed on
Commit
104349c
·
verified ·
1 Parent(s): a1022aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -26
app.py CHANGED
@@ -35,14 +35,28 @@ HF_TOKEN = os.getenv("HF_TOKEN")
35
  # Model name
36
  MODEL_NAME = "amiguel/class_insp_program"
37
 
38
- # Label mapping
 
 
 
39
  LABEL_TO_CLASS = {
40
- 0: "Campaign", 1: "Corrosion Monitoring", 2: "Flare Tip", 3: "Flare TIP",
41
- 4: "FU Items", 5: "Intelligent Pigging", 6: "Lifting", 7: "Non Structural Tank",
42
- 8: "Piping", 9: "Pressure Safety Device", 10: "Pressure Vessel (VIE)",
43
- 11: "Pressure Vessel (VII)", 12: "Structure", 13: "Flame Arrestor"
 
 
 
 
 
 
 
 
 
44
  }
45
 
 
 
46
  # Required columns - UPDATED
47
  REQUIRED_COLS = ["MaintItem text", "Functional Loc.", "Description"]
48
 
@@ -61,6 +75,11 @@ with st.sidebar:
61
  type=["xlsx", "csv"],
62
  label_visibility="collapsed"
63
  )
 
 
 
 
 
64
 
65
  # Initialize session state
66
  if "messages" not in st.session_state:
@@ -105,7 +124,7 @@ def process_file(uploaded_file, _cache_key):
105
  return None
106
 
107
 
108
- # Model loading function
109
  @st.cache_resource
110
  def load_model(hf_token):
111
  if not TRANSFORMERS_AVAILABLE:
@@ -114,41 +133,108 @@ def load_model(hf_token):
114
  if not hf_token:
115
  st.error("πŸ” Please set the HF_TOKEN environment variable.")
116
  return None
 
117
  login(token=hf_token)
 
 
118
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
119
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABEL_TO_CLASS), token=hf_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  device = "cuda" if torch.cuda.is_available() else "cpu"
121
  model.to(device)
 
 
 
 
 
122
  return model, tokenizer
 
123
  except Exception as e:
124
  st.error(f"πŸ€– Model loading failed: {str(e)}")
 
 
125
  return None
126
 
127
 
128
- # Classification function
129
- def classify_instruction(prompt, context, model, tokenizer):
130
  model.eval()
131
  device = model.device
132
 
133
  if isinstance(context, pd.DataFrame):
134
  predictions = []
135
- for text in context["input_text"]:
136
- full_prompt = f"Context:\n{text}\n\nInstruction: {prompt}"
137
- inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  inputs = {k: v.to(device) for k, v in inputs.items()}
 
139
  with torch.no_grad():
140
  outputs = model(**inputs)
141
- prediction = outputs.logits.argmax().item()
142
- predictions.append(LABEL_TO_CLASS[prediction])
 
 
 
 
 
 
 
 
 
 
 
 
143
  return predictions
144
  else:
145
- full_prompt = f"Context:\n{context}\n\nInstruction: {prompt}"
146
- inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
 
147
  inputs = {k: v.to(device) for k, v in inputs.items()}
 
148
  with torch.no_grad():
149
  outputs = model(**inputs)
 
150
  prediction = outputs.logits.argmax().item()
151
- return LABEL_TO_CLASS[prediction]
 
 
 
 
 
 
152
 
153
 
154
  # Excel download function - inserts Item Class before MaintItem text
@@ -162,14 +248,18 @@ def get_excel_download_link(df, filename="predicted_classes.xlsx"):
162
  cols = list(output_df.columns)
163
  if "Item Class" in cols:
164
  cols.remove("Item Class")
 
 
165
 
166
  # Find MaintItem text position
167
  if "MaintItem text" in cols:
168
  maint_idx = cols.index("MaintItem text")
169
- # Insert Item Class before MaintItem text
 
170
  cols.insert(maint_idx, "Item Class")
171
  else:
172
  # Fallback: put at beginning
 
173
  cols.insert(0, "Item Class")
174
 
175
  # Remove input_text column if present (internal use only)
@@ -198,9 +288,12 @@ def get_csv_download_link(df, filename="predicted_classes.csv"):
198
  cols = list(output_df.columns)
199
  if "Item Class" in cols and "MaintItem text" in cols:
200
  cols.remove("Item Class")
 
 
201
  maint_idx = cols.index("MaintItem text")
 
202
  cols.insert(maint_idx, "Item Class")
203
- output_df = output_df[cols]
204
 
205
  csv = output_df.to_csv(index=False)
206
  b64 = base64.b64encode(csv.encode()).decode()
@@ -261,15 +354,18 @@ if prompt := st.chat_input("Ask your inspection question..."):
261
  file_data = st.session_state.file_data
262
  if file_data["type"] == "table":
263
  with st.spinner("Classifying..."):
264
- predictions = classify_instruction(prompt, file_data["content"], model, tokenizer)
 
 
265
 
266
  # Add predictions to dataframe
267
  result_df = file_data["content"].copy()
268
  result_df["Item Class"] = predictions
 
269
 
270
  # Display preview (first 10 rows)
271
  st.write("**Predicted Item Classes (preview):**")
272
- display_cols = ["Item Class"] + REQUIRED_COLS
273
  st.dataframe(result_df[display_cols].head(10), use_container_width=True)
274
 
275
  # Stats
@@ -277,6 +373,10 @@ if prompt := st.chat_input("Ask your inspection question..."):
277
  st.write("**Class distribution:**")
278
  st.write(result_df["Item Class"].value_counts())
279
 
 
 
 
 
280
  # Download links
281
  st.markdown("---")
282
  col1, col2 = st.columns(2)
@@ -287,15 +387,21 @@ if prompt := st.chat_input("Ask your inspection question..."):
287
 
288
  response = f"βœ… Classification completed for {len(predictions)} rows."
289
  else:
290
- predicted_class = classify_instruction(prompt, file_data["content"], model, tokenizer)
291
- response = f"The Item Class is: **{predicted_class}**"
 
 
292
  else:
293
- predicted_class = classify_instruction(prompt, "", model, tokenizer)
294
- response = f"The Item Class is: **{predicted_class}**"
 
 
295
 
296
  st.markdown(response)
297
  st.session_state.messages.append({"role": "assistant", "content": response})
298
  except Exception as e:
299
  st.error(f"⚑ Classification error: {str(e)}")
 
 
300
  else:
301
- st.error("πŸ€– Model not loaded!")
 
35
  # Model name
36
  MODEL_NAME = "amiguel/class_insp_program"
37
 
38
+ # =============================================================================
39
+ # FIXED: Label mapping must match EXACTLY what the model was trained with
40
+ # The model was trained with 13 classes (Flare Tip and Flare TIP were merged)
41
+ # =============================================================================
42
  LABEL_TO_CLASS = {
43
+ 0: "Campaign",
44
+ 1: "Corrosion Monitoring",
45
+ 2: "Flare Tip", # This now covers both "Flare Tip" and "Flare TIP"
46
+ 3: "FU Items",
47
+ 4: "Intelligent Pigging",
48
+ 5: "Lifting",
49
+ 6: "Non Structural Tank",
50
+ 7: "Piping",
51
+ 8: "Pressure Safety Device",
52
+ 9: "Pressure Vessel (VIE)",
53
+ 10: "Pressure Vessel (VII)",
54
+ 11: "Structure",
55
+ 12: "Flame Arrestor"
56
  }
57
 
58
+ NUM_LABELS = len(LABEL_TO_CLASS) # Should be 13
59
+
60
  # Required columns - UPDATED
61
  REQUIRED_COLS = ["MaintItem text", "Functional Loc.", "Description"]
62
 
 
75
  type=["xlsx", "csv"],
76
  label_visibility="collapsed"
77
  )
78
+
79
+ # Show model info
80
+ st.markdown("---")
81
+ st.markdown(f"**Model:** `{MODEL_NAME}`")
82
+ st.markdown(f"**Classes:** {NUM_LABELS}")
83
 
84
  # Initialize session state
85
  if "messages" not in st.session_state:
 
124
  return None
125
 
126
 
127
+ # Model loading function - FIXED
128
  @st.cache_resource
129
  def load_model(hf_token):
130
  if not TRANSFORMERS_AVAILABLE:
 
133
  if not hf_token:
134
  st.error("πŸ” Please set the HF_TOKEN environment variable.")
135
  return None
136
+
137
  login(token=hf_token)
138
+
139
+ # Load tokenizer
140
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
141
+
142
+ # =================================================================
143
+ # FIXED: Load model WITHOUT specifying num_labels
144
+ # Let it auto-detect from config.json, or use ignore_mismatched_sizes
145
+ # =================================================================
146
+ try:
147
+ # First try: Load without specifying num_labels (uses config.json)
148
+ model = AutoModelForSequenceClassification.from_pretrained(
149
+ MODEL_NAME,
150
+ token=hf_token
151
+ )
152
+ except Exception as e1:
153
+ # Fallback: Try with explicit num_labels and ignore size mismatch
154
+ st.warning(f"Auto-load failed, trying with explicit config: {str(e1)}")
155
+ model = AutoModelForSequenceClassification.from_pretrained(
156
+ MODEL_NAME,
157
+ num_labels=NUM_LABELS,
158
+ token=hf_token,
159
+ ignore_mismatched_sizes=True # This allows loading even if sizes differ
160
+ )
161
+
162
  device = "cuda" if torch.cuda.is_available() else "cpu"
163
  model.to(device)
164
+ model.eval()
165
+
166
+ # Log successful load
167
+ st.sidebar.success(f"βœ… Model loaded on {device}")
168
+
169
  return model, tokenizer
170
+
171
  except Exception as e:
172
  st.error(f"πŸ€– Model loading failed: {str(e)}")
173
+ import traceback
174
+ st.error(f"Full traceback:\n```\n{traceback.format_exc()}\n```")
175
  return None
176
 
177
 
178
# Classification function — returns predicted class names, optionally with
# softmax confidence scores.
def classify_instruction(prompt, context, model, tokenizer, return_confidence=False):
    """Classify either a DataFrame of texts or a single text.

    If *context* is a DataFrame, its "input_text" column is classified in
    batches and a list of class names is returned (plus a parallel list of
    confidences when return_confidence is True). Otherwise the single
    *context* string (falling back to *prompt* when context is falsy) is
    classified and one class name (and confidence) is returned.
    """
    model.eval()
    device = model.device

    if isinstance(context, pd.DataFrame):
        labels, scores = [], []

        # Batch inference for efficiency.
        BATCH = 32
        all_texts = context["input_text"].tolist()

        for start in range(0, len(all_texts), BATCH):
            chunk = all_texts[start:start + BATCH]

            encoded = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128,
            )
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            with torch.no_grad():
                outputs = model(**encoded)
            probabilities = torch.softmax(outputs.logits, dim=-1)
            pred_ids = outputs.logits.argmax(dim=-1).cpu().numpy()
            pred_confs = probabilities.max(dim=-1).values.cpu().numpy()

            for label_id, score in zip(pred_ids, pred_confs):
                # Guard against a prediction id outside our mapping.
                labels.append(LABEL_TO_CLASS.get(label_id, f"Unknown ({label_id})"))
                scores.append(float(score))

        if return_confidence:
            return labels, scores
        return labels

    # Single-text classification.
    text = str(context) if context else prompt
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded)
    probabilities = torch.softmax(outputs.logits, dim=-1)
    label_id = outputs.logits.argmax().item()
    score = probabilities[0, label_id].item()

    result = LABEL_TO_CLASS.get(label_id, f"Unknown ({label_id})")

    if return_confidence:
        return result, score
    return result
238
 
239
 
240
  # Excel download function - inserts Item Class before MaintItem text
 
248
  cols = list(output_df.columns)
249
  if "Item Class" in cols:
250
  cols.remove("Item Class")
251
+ if "Confidence" in cols:
252
+ cols.remove("Confidence")
253
 
254
  # Find MaintItem text position
255
  if "MaintItem text" in cols:
256
  maint_idx = cols.index("MaintItem text")
257
+ # Insert Item Class and Confidence before MaintItem text
258
+ cols.insert(maint_idx, "Confidence")
259
  cols.insert(maint_idx, "Item Class")
260
  else:
261
  # Fallback: put at beginning
262
+ cols.insert(0, "Confidence")
263
  cols.insert(0, "Item Class")
264
 
265
  # Remove input_text column if present (internal use only)
 
288
  cols = list(output_df.columns)
289
  if "Item Class" in cols and "MaintItem text" in cols:
290
  cols.remove("Item Class")
291
+ if "Confidence" in cols:
292
+ cols.remove("Confidence")
293
  maint_idx = cols.index("MaintItem text")
294
+ cols.insert(maint_idx, "Confidence")
295
  cols.insert(maint_idx, "Item Class")
296
+ output_df = output_df[[c for c in cols if c in output_df.columns]]
297
 
298
  csv = output_df.to_csv(index=False)
299
  b64 = base64.b64encode(csv.encode()).decode()
 
354
  file_data = st.session_state.file_data
355
  if file_data["type"] == "table":
356
  with st.spinner("Classifying..."):
357
+ predictions, confidences = classify_instruction(
358
+ prompt, file_data["content"], model, tokenizer, return_confidence=True
359
+ )
360
 
361
  # Add predictions to dataframe
362
  result_df = file_data["content"].copy()
363
  result_df["Item Class"] = predictions
364
+ result_df["Confidence"] = [f"{c:.2%}" for c in confidences]
365
 
366
  # Display preview (first 10 rows)
367
  st.write("**Predicted Item Classes (preview):**")
368
+ display_cols = ["Item Class", "Confidence"] + REQUIRED_COLS
369
  st.dataframe(result_df[display_cols].head(10), use_container_width=True)
370
 
371
  # Stats
 
373
  st.write("**Class distribution:**")
374
  st.write(result_df["Item Class"].value_counts())
375
 
376
+ # Average confidence
377
+ avg_conf = sum(confidences) / len(confidences)
378
+ st.write(f"**Average confidence:** {avg_conf:.2%}")
379
+
380
  # Download links
381
  st.markdown("---")
382
  col1, col2 = st.columns(2)
 
387
 
388
  response = f"βœ… Classification completed for {len(predictions)} rows."
389
  else:
390
+ predicted_class, confidence = classify_instruction(
391
+ prompt, file_data["content"], model, tokenizer, return_confidence=True
392
+ )
393
+ response = f"The Item Class is: **{predicted_class}** (confidence: {confidence:.2%})"
394
  else:
395
+ predicted_class, confidence = classify_instruction(
396
+ prompt, "", model, tokenizer, return_confidence=True
397
+ )
398
+ response = f"The Item Class is: **{predicted_class}** (confidence: {confidence:.2%})"
399
 
400
  st.markdown(response)
401
  st.session_state.messages.append({"role": "assistant", "content": response})
402
  except Exception as e:
403
  st.error(f"⚑ Classification error: {str(e)}")
404
+ import traceback
405
+ st.error(f"```\n{traceback.format_exc()}\n```")
406
  else:
407
+ st.error("πŸ€– Model not loaded!")