Update app.py
app.py
CHANGED
@@ -1,60 +1,94 @@
@@ -63,91 +97,294 @@

[The previous revision of app.py is truncated in this diff extraction. Only a few removed lines are recoverable: a single-line transform pipeline, TFM = T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])]), and a Gemini client built through the OpenAI-compatible wrapper, gemini_client = OpenAI(api_key=GEMINI_API_KEY). Both are replaced in the updated file below.]

Updated app.py:
import gradio as gr
import pandas as pd
from PIL import Image
import torch
import torchvision.transforms as T
import os
import json
import sentence_transformers
from huggingface_hub import hf_hub_download
import pickle
import timm
import google.generativeai as genai

# ============================================
# 1. LOAD IMAGE CLASSIFICATION MODEL
# ============================================
print("Loading image classification model...")
REPO_ID = "keerthikoganti/architecture-design-stages-compact-cnn"

pkl_path = hf_hub_download(repo_id=REPO_ID, filename="model_bundle.pkl")
with open(pkl_path, "rb") as f:
    bundle = pickle.load(f)

architecture = bundle["architecture"]
num_classes = bundle["num_classes"]
class_names = bundle["class_names"]
state_dict = bundle["state_dict"]

device = "cuda" if torch.cuda.is_available() else "cpu"
model = timm.create_model(architecture, pretrained=False, num_classes=num_classes)
model.load_state_dict(state_dict)
model.eval().to(device)

TFM = T.Compose([
    T.Resize(224),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

print("✅ Image classification model loaded successfully!")
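
# Example (editor's sketch, not part of the original app): how the pieces above
# combine on a single image. The file path "sample.jpg" is hypothetical.
def _example_single_image_inference(path: str = "sample.jpg") -> str:
    img = Image.open(path).convert("RGB")
    x = TFM(img).unsqueeze(0).to(device)       # [1, 3, 224, 224] normalized tensor
    with torch.no_grad():
        probs = torch.softmax(model(x), dim=1)[0]
    return class_names[int(probs.argmax())]    # predicted design stage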

# ============================================
# 2. LOAD TEXT CLASSIFICATION MODEL
# ============================================
print("Loading text classification model...")
from autogluon.tabular import TabularPredictor
import shutil

text_repo_id = "kaitongg/my-autogluon-model"
download_dir = "downloaded_predictor"

if os.path.exists(download_dir):
    shutil.rmtree(download_dir)

os.makedirs(download_dir, exist_ok=True)

from huggingface_hub import snapshot_download
downloaded_path = snapshot_download(
    repo_id=text_repo_id,
    repo_type="model",
    local_dir=download_dir,
    local_dir_use_symlinks=False,
)

predictor_path = os.path.join(downloaded_path, "autogluon_predictor")
loaded_predictor_from_hub = TabularPredictor.load(predictor_path)

embedding_model = sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("✅ Text classification model loaded successfully!")
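
# Example (editor's sketch, not part of the original app): the predictor was
# trained on sentence-embedding features named e0..e{d-1}, so inference input must
# use the same column names. all-MiniLM-L6-v2 outputs 384-dimensional embeddings.
def _example_text_features(text: str) -> pd.DataFrame:
    emb = embedding_model.encode([text], convert_to_numpy=True)  # shape (1, 384)
    return pd.DataFrame(emb, columns=[f"e{i}" for i in range(emb.shape[1])])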

# ============================================
# 3. INITIALIZE GEMINI API
# ============================================
print("Initializing Gemini API...")

# Get API key from environment variable (set in Hugging Face Spaces secrets)
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel('gemini-1.5-flash')
    print("✅ Gemini API initialized successfully!")
else:
    gemini_model = None
    print("⚠️ Warning: GEMINI_API_KEY not found in environment variables")
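
# Example (editor's sketch, not part of the original app): a one-off smoke test
# for the configured client. Run manually; it costs one short API call.
def _example_gemini_smoke_test() -> str:
    if gemini_model is None:
        return "GEMINI_API_KEY is not set"
    return gemini_model.generate_content("Reply with the single word: ready").text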

# ============================================
# 4. LLM ATTITUDE MAPPING
# ============================================
llm_attitude_mapping = {
    "brainstorm": "creative and encouraging",
    "design_iteration": "constructive and detailed, focusing on improvements",
    # ... (the entries on file lines 95-96 fall outside the diff hunks and are not shown)
    "random": "neutral and informative, perhaps suggesting a relevant stage",
}
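
# Example (editor's sketch, not part of the original app): unknown stages fall back
# to the "random" attitude via dict.get, as used in generate_prompt_only below.
# llm_attitude_mapping.get("unknown", llm_attitude_mapping["random"])
# -> "neutral and informative, perhaps suggesting a relevant stage"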

# ============================================
# 5. TEXT CLASSIFICATION FUNCTION
# ============================================
def perform_text_classification_and_format(text: str) -> tuple:
    text_classification_formatted = "No text provided"
    text_classification_probabilities = {"No High Concept": 0.0, "High Concept": 0.0}
    predicted_text_label = "0"

    if text and loaded_predictor_from_hub is not None and embedding_model is not None:
        try:
            embeddings = embedding_model.encode(
                [text],
                batch_size=1,
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=False,
            )

            n, d = embeddings.shape
            text_df_processed = pd.DataFrame(embeddings, columns=[f"e{i}" for i in range(d)])

            text_proba_df = loaded_predictor_from_hub.predict_proba(text_df_processed)

            text_classification_probabilities = {
                "No High Concept": float(text_proba_df.iloc[0].get("0", 0.0)),
                "High Concept": float(text_proba_df.iloc[0].get("1", 0.0)),
            }

            predicted_text_label = str(loaded_predictor_from_hub.predict(text_df_processed).iloc[0])

            if predicted_text_label == "1":
                has_high_concept = "Yes"
                confidence = text_classification_probabilities["High Concept"]
            else:
                has_high_concept = "No"
                confidence = text_classification_probabilities["No High Concept"]

            text_classification_formatted = f"High Concept: {has_high_concept} (Confidence: {confidence:.2f})"

        except Exception as e:
            print(f"Error processing text: {e}")
            text_classification_formatted = f"Text classification failed: {e}"

    return text_classification_formatted, text_classification_probabilities, predicted_text_label
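
# Example (editor's sketch, not part of the original app): calling the text
# classifier directly; the sentence is illustrative only.
def _example_text_classification():
    fmt, probs, label = perform_text_classification_and_format(
        "The folded roof dissolves the boundary between landscape and shelter."
    )
    return fmt, probs, label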

# ============================================
# 6. COMBINED CLASSIFICATION FUNCTION
# ============================================
def perform_classification_and_format(image: Image.Image, text: str) -> tuple:
    image_classification_results = {"error": "No image provided"}
    design_stage = "unknown"

    if image is not None and model is not None:
        try:
            img_tensor = TFM(image).unsqueeze(0).to(device)

            with torch.no_grad():
                img_output = model(img_tensor)

            img_probabilities = torch.softmax(img_output, dim=1)[0]
            predicted_class_index = torch.argmax(img_probabilities).item()
            design_stage = class_names[predicted_class_index]

            image_classification_results = {
                class_names[i]: float(img_probabilities[i])
                for i in range(len(class_names))
            }

            print(f"✅ Image classified as: {design_stage}")

        except Exception as e:
            print(f"❌ Error processing image: {e}")
            image_classification_results = {"error": f"Image classification failed: {e}"}

    text_classification_formatted, text_classification_probabilities, predicted_text_label = perform_text_classification_and_format(text)

    return image_classification_results, text_classification_probabilities, text_classification_formatted, predicted_text_label
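
# Example (editor's sketch, not part of the original app): the combined classifier
# returns four values in the order the Gradio callbacks below expect. The image
# path is hypothetical.
def _example_combined_classification():
    img = Image.open("sample.jpg").convert("RGB")
    return perform_classification_and_format(img, "A sketch exploring light and mass.")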

# ============================================
# 7. PROMPT GENERATION FUNCTION
# ============================================
def generate_prompt_only(image_classification_results: dict,
                         text_classification_probabilities: dict,
                         predicted_text_label: str,
                         text: str) -> str:
    design_stage = "unknown"
    if image_classification_results and "error" not in image_classification_results:
        try:
            design_stage = max(image_classification_results, key=image_classification_results.get)
        except Exception:
            design_stage = "unknown"

    has_high_concept = "Unable to determine"
    confidence = 0.0
    if text_classification_probabilities and "error" not in text_classification_probabilities:
        try:
            if predicted_text_label == "1":
                has_high_concept = "Yes"
                confidence = text_classification_probabilities.get("High Concept", 0.0)
            else:
                has_high_concept = "No"
                confidence = text_classification_probabilities.get("No High Concept", 0.0)
        except Exception:
            has_high_concept = "Unable to determine"
            confidence = 0.0

    llm_attitude = llm_attitude_mapping.get(design_stage, llm_attitude_mapping["random"])

    prompt = f"""You are an abstract architecture critique interpreter.
Your audience is a low-level architecture student.
The user is in the {design_stage} design stage, so your attitude should be {llm_attitude}.
The user's input {'contains' if has_high_concept == 'Yes' else 'does not contain'} abstract architectural concepts (confidence: {confidence:.2f}).

RULES:
- Write in English, strictly 250-350 words.
- MUST end with a complete sentence with proper punctuation.
- Never repeat any viewpoint or sentence.
- No slogans, catchphrases, or parallel sentence structures.
- No meta-commentary like "Output complete", "End of response", etc.
- Stop immediately after the final sentence ends.

User input: {text}

Explain abstract concepts using simple, everyday examples that a child could understand, and provide actionable suggestions.
"""
    return prompt
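
# Example (editor's sketch, not part of the original app): the arg-max key of the
# image-probability dict becomes the design stage inside generate_prompt_only.
# All scores here are hypothetical.
def _example_prompt_generation() -> str:
    return generate_prompt_only(
        {"brainstorm": 0.7, "design_iteration": 0.3},
        {"No High Concept": 0.2, "High Concept": 0.8},
        "1",
        "The plan folds around a central courtyard.",
    )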

# ============================================
# 8. GEMINI FEEDBACK GENERATION
# ============================================
def generate_feedback_from_prompt(prompt_input: str) -> str:
    if gemini_model is None:
        return "⚠️ Gemini API not configured. Please set GEMINI_API_KEY in Hugging Face Spaces secrets."

    try:
        print("Generating feedback with Gemini...")

        generation_config = genai.types.GenerationConfig(
            temperature=0.7,
            max_output_tokens=500,
            top_p=0.9,
        )

        response = gemini_model.generate_content(
            prompt_input,
            generation_config=generation_config
        )

        llm_response_text = response.text.strip()

        # Post-processing: Remove meta-commentary
        meta_phrases = [
            "Output complete", "End of output", "No more text",
            "Final output", "Response complete", "✅"
        ]
        for phrase in meta_phrases:
            if llm_response_text.endswith(phrase):
                llm_response_text = llm_response_text[:-len(phrase)].strip()

        print("✅ Feedback generated successfully")
        return llm_response_text

    except Exception as e:
        print(f"❌ Error during Gemini interaction: {e}")
        return f"Error generating feedback: {str(e)}"
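
# Example (editor's sketch, not part of the original app): the full text-only path
# the UI wires together, driven programmatically. Passing None skips the image leg.
def _example_feedback_roundtrip(user_text: str) -> str:
    img_res, txt_probs, _fmt, txt_label = perform_classification_and_format(None, user_text)
    prompt = generate_prompt_only(img_res, txt_probs, txt_label, user_text)
    return generate_feedback_from_prompt(prompt)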

# ============================================
# 9. GRADIO INTERFACE
# ============================================
examples = [
    ["https://balancedarchitecture.com/wp-content/uploads/2021/11/EXISTING-FIRST-FLOOR-PRES-scaled-e1635965923983.jpg",
     "Exploring spatial relationships and material palettes."],
    ["https://cdn.prod.website-files.com/5894a32730554b620f7bf36d/5e848c2d622e7abe1ad48504_5e01ce9f0d272014d0353cd1_Things-You-Need-to-Organize-a-3D-Rendering-Architectural-Project-EASY-RENDER.jpeg",
     "The window size is too small."],
    ["https://architectelevator.com/assets/img/bilbao_sketch.png",
     "The facade expresses the building's relationship with the urban context."],
]

with gr.Blocks(css="""
    .left-column, .middle-column, .right-column {min-width: 300px !important;}
    .textbox-container textarea {min-height: 150px !important;}
""") as demo:

    gr.Markdown("# 🏛️ Architecture Feedback Generator (Powered by Gemini)")
    gr.Markdown("""
    Upload an architectural image and provide a text description or question.
    The system will classify the design stage, analyze the text for high-level concepts,
    generate a customized prompt, and provide AI-powered feedback using Google's Gemini.
    """)

    with gr.Row():
        # LEFT COLUMN - Input Section
        with gr.Column(scale=1, elem_classes="left-column"):
            gr.Markdown("### 📥 Input")
            image_input = gr.Image(type="pil", label="Upload Architectural Image", height=300)
            text_input = gr.Textbox(
                label="Enter Text Description or Question",
                placeholder="Describe your architectural design, ask questions, or provide context...",
                lines=6,
                elem_classes="textbox-container"
            )
            classify_button = gr.Button("🔍 Classify & Generate Prompt", variant="primary", size="lg")

        # MIDDLE COLUMN - Classification & Prompt Section
        with gr.Column(scale=1, elem_classes="middle-column"):
            gr.Markdown("### 📊 Classification Results & Prompt")
            image_output_label = gr.Label(
                num_top_classes=len(class_names),
                label="Image Classification (Design Stage)"
            )
            text_output_textbox = gr.Textbox(
                label="Text Classification (High Concept Detection)",
                lines=2,
                elem_classes="textbox-container"
            )
            prompt_output_textbox = gr.Textbox(
                label="Generated Prompt (Editable)",
                lines=10,
                interactive=True,
                elem_classes="textbox-container"
            )
            generate_feedback_button = gr.Button("✨ Generate AI Feedback", variant="primary", size="lg")

        # RIGHT COLUMN - Gemini Output Section
        with gr.Column(scale=1, elem_classes="right-column"):
            gr.Markdown("### 🤖 AI-Generated Feedback")
            llm_output_text = gr.Textbox(
                label="Gemini Response",
                lines=20,
                elem_classes="textbox-container",
                show_copy_button=True
            )

    # Hidden state variables
    text_classification_probabilities_state = gr.State()
    predicted_text_label_state = gr.State()

    # Step 1: Classification
    classification_outputs = classify_button.click(
        fn=perform_classification_and_format,
        inputs=[image_input, text_input],
        outputs=[
            image_output_label,
            text_classification_probabilities_state,
            text_output_textbox,
            predicted_text_label_state
        ]
    )

    # Step 2: Generate Prompt
    def generate_prompt_wrapper(img_res, txt_prob, predicted_label, txt):
        return generate_prompt_only(img_res, txt_prob, predicted_label, txt)

    classification_outputs.then(
        fn=generate_prompt_wrapper,
        inputs=[
            image_output_label,
            text_classification_probabilities_state,
            predicted_text_label_state,
            text_input
        ],
        outputs=prompt_output_textbox
    )

    # Step 3: Gemini Feedback
    generate_feedback_button.click(
        fn=generate_feedback_from_prompt,
        inputs=[prompt_output_textbox],
        outputs=llm_output_text
    )

    # Examples Section
    gr.Markdown("---")
    gr.Markdown("### 💡 Example Inputs")

    def generate_full_chain_output(img, txt):
        img_res, txt_prob, txt_fmt, predicted_label = perform_classification_and_format(img, txt)
        prompt = generate_prompt_only(img_res, txt_prob, predicted_label, txt)
        llm_res = generate_feedback_from_prompt(prompt)
        return img_res, txt_fmt, prompt, llm_res

    gr.Examples(
        examples=examples,
        inputs=[image_input, text_input],
        outputs=[image_output_label, text_output_textbox, prompt_output_textbox, llm_output_text],
        fn=generate_full_chain_output,
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()
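
# Example (editor's sketch, not part of the original app): on Spaces the bare
# demo.launch() above is all that is needed; for local testing a shareable
# public link can be requested instead:
# demo.launch(share=True)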