Spaces:

MrSimple01
/

QuizGenerator

Sleeping

App Files Files Community

MrSimple01 committed on Apr 8, 2025

Commit

f3c5ac6

verified ·

1 Parent(s): 2876d5f

Update app.py

Browse files

Files changed (1) hide show

app.py +233 -201

app.py CHANGED Viewed

@@ -1,16 +1,56 @@
 import re
-import numpy as np
 import json
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer
-from langchain_google_genai import ChatGoogleGenerativeAI
-import os
-import gradio as gr
 import time
-tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
-sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
 def clean_text(text):
     text = re.sub(r'\[speaker_\d+\]', '', text)
@@ -43,243 +83,235 @@ def split_text_by_tokens(text, max_tokens=8000):
     return [" ".join(first_half), " ".join(second_half)]
-def analyze_segment_with_gemini(segment_text):
-    llm = ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        temperature=0.7,
-        max_tokens=None,
-        timeout=None,
-        max_retries=3
-    )
-    prompt = f"""
-        Analyze the following text and identify distinct segments within it and do text segmentation:
-        1. Segments should be STRICTLY max=15
-        2. For each segment/topic you identify:
-           - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
-           - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
-           - Write a brief summary of that segment (3-5 sentences)
-           - Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content in that segment only
-           - Questions and answers should be only from the content of the segment
-        For each quiz question:
-        - Create one correct answer that comes DIRECTLY from the text
-        - Create two plausible but incorrect answers
-        - IMPORTANT: Ensure all answer options have similar length (± 3 words)
-        - Ensure the correct answer is clearly indicated with a ✓ symbol
-       - Questions should **require actual understanding**, NOT just basic fact recall.
-        - Questions Are **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**.
-        - Are **directly based on the segment's content** (not inferred from the summary).
-        - Do **not include questions about document structure** (e.g., title, number of paragraphs).
-        - Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?").
-        - Focus on **core ideas, logical reasoning, and conceptual understanding**.
-    ADDITIONAL REQUIREMENT:
-    - **First, detect the language of the original text.**
-    - **Generate ALL output (topic names, key concepts, summaries, and quizzes) in the same language as the original text.**
-    - If the text is in Russian, generate all responses in Russian.
-    - If the text is in another language, generate responses in that original language.
-        Text:
-        {segment_text}
-        Format your response as JSON with the following structure:
-        {{
-            "segments": [
-                {{
-                    "topic_name": "Unique and Specific Topic Name",
-                    "key_concepts": ["concept1", "concept2", "concept3"],
-                    "summary": "Brief summary of this segment.",
-                    "quiz_questions": [
-                        {{
-                            "question": "Question text?",
-                            "options": [
-                                {{
-                                    "text": "Option A",
-                                    "correct": false
-                                }},
-                                {{
-                                    "text": "Option B",
-                                    "correct": true
-                                }},
-                                {{
-                                    "text": "Option C",
-                                    "correct": false
-                                }}
-                            ]
-                        }}
-                    ]
-                }}
-            ]
-        }}
-        IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
-    - **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
-    - **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
-    """
-    response = llm.invoke(prompt)
-    response_text = response.content
-    try:
-        json_match = re.search(r'\{[\s\S]*\}', response_text)
-        if json_match:
-            return json.loads(json_match.group(0))
-        else:
-            return json.loads(response_text)
-    except json.JSONDecodeError:
-        return {
-            "segments": [
-                {
-                    "topic_name": "JSON Parsing Error",
-                    "key_concepts": ["Error in response format"],
-                    "summary": "Could not parse the API response.",
-                    "quiz_questions": []
-                }
-            ]
         }
-def process_document_with_quiz(text):
-    start_time = time.time()
-    token_count = len(tokenizer.encode(text))
-    print(f"[LOG] Total document tokens: {token_count}")
-    if token_count > 8000:
-        print(f"[LOG] Document exceeds 8000 tokens. Splitting into parts.")
-        parts = split_text_by_tokens(text)
-        print(f"[LOG] Document split into {len(parts)} parts")
-        for i, part in enumerate(parts):
-            part_tokens = len(tokenizer.encode(part))
-            print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
-    else:
-        print(f"[LOG] Document under 8000 tokens. Processing as a single part.")
-        parts = [text]
-    all_segments = []
-    segment_counter = 1
-    for i, part in enumerate(parts):
-        part_start_time = time.time()
-        print(f"[LOG] Processing part {i+1}...")
-        analysis = analyze_segment_with_gemini(part)
-        if "segments" in analysis:
-            print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
-            for segment in analysis["segments"]:
-                segment["segment_number"] = segment_counter
-                all_segments.append(segment)
-                print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
-                segment_counter += 1
-        else:
-            # Fallback if response format is unexpected
-            print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
-            fallback_segment = {
-                "topic_name": f"Segment {segment_counter} Analysis",
-                "key_concepts": ["Format error in analysis"],
-                "summary": "Could not properly segment this part of the text.",
-                "quiz_questions": [],
-                "segment_number": segment_counter
-            }
-            all_segments.append(fallback_segment)
-            print(f"[LOG] Added fallback segment {segment_counter}")
-            segment_counter += 1
-        part_time = time.time() - part_start_time
-        print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")
-    total_time = time.time() - start_time
-    print(f"[LOG] Total processing time: {total_time:.2f} seconds")
-    print(f"[LOG] Generated {len(all_segments)} segments total")
-    return all_segments
 def format_quiz_for_display(results):
     output = []
-    for segment in results:
         topic = segment["topic_name"]
-        segment_num = segment["segment_number"]
         output.append(f"\n\n{'='*40}")
         output.append(f"SEGMENT {segment_num}: {topic}")
         output.append(f"{'='*40}\n")
         output.append("KEY CONCEPTS:")
         for concept in segment["key_concepts"]:
             output.append(f"• {concept}")
         output.append("\nSUMMARY:")
         output.append(segment["summary"])
         output.append("\nQUIZ QUESTIONS:")
         for i, q in enumerate(segment["quiz_questions"]):
             output.append(f"\n{i+1}. {q['question']}")
             for j, option in enumerate(q['options']):
-                letter = chr(97 + j).upper()
                 correct_marker = " ✓" if option["correct"] else ""
                 output.append(f"   {letter}. {option['text']}{correct_marker}")
     return "\n".join(output)
-def save_results_as_json(results, filename="analysis_results.json"):
-    with open(filename, "w", encoding="utf-8") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
-    return filename
-def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
-    with open(filename, "w", encoding="utf-8") as f:
-        f.write(formatted_text)
-    return filename
-def analyze_document(document_text, api_key):
-    print(f"[LOG] Starting document analysis...")
-    overall_start_time = time.time()
-    os.environ["GOOGLE_API_KEY"] = api_key
     try:
-        results = process_document_with_quiz(document_text)
-        formatted_output = format_quiz_for_display(results)
-        json_path = "analysis_results.json"
-        txt_path = "analysis_results.txt"
-        with open(json_path, "w", encoding="utf-8") as f:
-            json.dump(results, f, indent=2, ensure_ascii=False)
-        with open(txt_path, "w", encoding="utf-8") as f:
-            f.write(formatted_output)
-        overall_time = time.time() - overall_start_time
-        print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")
-        topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
-        topics_summary += f"Total segments: {len(results)}\n"
-        topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
-        topics_summary += "SEGMENTS:\n"
-        for segment in results:
-            topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"
-        formatted_output = topics_summary + "\n" + formatted_output
-        return formatted_output, json_path, txt_path
     except Exception as e:
-        error_msg = f"Error processing document: {str(e)}"
-        print(f"[LOG] ERROR: {error_msg}")
-        return error_msg, None, None
 with gr.Blocks(title="Quiz Generator") as app:
     gr.Markdown("# Quiz Generator")
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
@@ -289,8 +321,8 @@ with gr.Blocks(title="Quiz Generator") as app:
             )
             api_key = gr.Textbox(
-                label="Gemini API Key",
-                placeholder="Enter your Gemini API key",
                 type="password"
             )
@@ -306,9 +338,9 @@ with gr.Blocks(title="Quiz Generator") as app:
     analyze_btn.click(
         fn=analyze_document,
-        inputs=[input_text, api_key],
         outputs=[output_results, json_file_output, txt_file_output]
     )
 if __name__ == "__main__":
     app.launch()

+import os
 import re
 import json
 import time
+import gradio as gr
+import tempfile
+from typing import Dict, Any, List, Optional
+from transformers import AutoTokenizer
+from sentence_transformers import SentenceTransformer
+from pydantic import BaseModel, Field
+from anthropic import Anthropic
+# Model identifiers for each supported LLM provider.
+CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
+OPENAI_MODEL = "gpt-4o"
+GEMINI_MODEL = "gemini-2.0-flash"
+# Sampling temperature passed to every LLM client created in this file.
+DEFAULT_TEMPERATURE = 0.7
+# Model names loaded at import time for the tokenizer and the sentence encoder.
+TOKENIZER_MODEL = "answerdotai/ModernBERT-base"
+SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"
+# Schema for the course metadata (course, section, lesson names) attached to an analysis.
+class CourseInfo(BaseModel):
+    course_name: str = Field(description="Name of the course")
+    section_name: str = Field(description="Name of the course section")
+    lesson_name: str = Field(description="Name of the lesson")
+# Schema for a single answer option of a quiz question, flagged as correct or not.
+class QuizOption(BaseModel):
+    text: str = Field(description="The text of the answer option")
+    correct: bool = Field(description="Whether this option is correct")
+# Schema for one quiz question together with its list of answer options.
+class QuizQuestion(BaseModel):
+    question: str = Field(description="The text of the quiz question")
+    options: List[QuizOption] = Field(description="List of answer options")
+# Schema for one analyzed text segment: its number, topic, key concepts,
+# summary and the quiz questions generated for it.
+class Segment(BaseModel):
+    segment_number: int = Field(description="The segment number")
+    topic_name: str = Field(description="Unique and specific topic name that clearly differentiates it from other segments")
+    key_concepts: List[str] = Field(description="3-5 key concepts discussed in the segment")
+    summary: str = Field(description="Brief summary of the segment (3-5 sentences)")
+    quiz_questions: List[QuizQuestion] = Field(description="5 quiz questions based on the segment content")
+# Top-level analysis result: course metadata plus the list of segments.
+# generate_with_claude uses this model's JSON schema as the tool input schema.
+class TextSegmentAnalysis(BaseModel):
+    course_info: CourseInfo = Field(description="Information about the course")
+    segments: List[Segment] = Field(description="List of text segments with analysis")
+# Both models are loaded once, when the module is imported.
+# NOTE(review): sentence_model is not referenced in the visible code — confirm it is still needed.
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)
+sentence_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
+# System prompt sent as the system message by segment_and_analyze_text
+# (the LangChain path); generate_with_claude defines its own local prompt.
+system_prompt = """You are an expert educational content analyzer. Your task is to analyze text content,
+identify distinct segments, and create high-quality educational quiz questions for each segment."""
 def clean_text(text):
     text = re.sub(r'\[speaker_\d+\]', '', text)
     return [" ".join(first_half), " ".join(second_half)]
+def generate_with_claude(text, api_key, course_name="", section_name="", lesson_name=""):
+    """Analyze ``text`` with Claude and return the structured segment analysis.
+
+    The request forces the ``build_segment_analysis`` tool, whose input schema
+    is generated from ``TextSegmentAnalysis``, so the model's answer arrives
+    as the ``input`` payload of that tool call.
+
+    Args:
+        text: Source text to segment and turn into quiz questions.
+        api_key: Anthropic API key used to create the client.
+        course_name: Course name substituted into the prompt template.
+        section_name: Section name substituted into the prompt template.
+        lesson_name: Lesson name substituted into the prompt template.
+
+    Returns:
+        The ``input`` payload of the ``build_segment_analysis`` tool call.
+
+    Raises:
+        Exception: If the API call fails, or if the response contains no
+            tool call.
+    """
+    from prompts import ANALYSIS_PROMPT_TEMPLATE_CLAUDE
+    client = Anthropic(api_key=api_key)
+    segment_analysis_schema = TextSegmentAnalysis.model_json_schema()
+    tools = [
+        {
+            "name": "build_segment_analysis",
+            "description": "Build the text segment analysis with quiz questions",
+            "input_schema": segment_analysis_schema
         }
+    ]
+    system_prompt = """You are a helpful assistant specialized in text analysis and educational content creation.
+    You analyze texts to identify distinct segments, create summaries, and generate quiz questions."""
+    prompt = ANALYSIS_PROMPT_TEMPLATE_CLAUDE.format(
+        course_name=course_name,
+        section_name=section_name,
+        lesson_name=lesson_name,
+        text=text
+    )
+    # Only the network call sits inside the try, so a malformed response is
+    # not misreported as an API failure.
+    try:
+        response = client.messages.create(
+            model=CLAUDE_MODEL,
+            max_tokens=8192,
+            temperature=DEFAULT_TEMPERATURE,
+            system=system_prompt,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ],
+            tools=tools,
+            tool_choice={"type": "tool", "name": "build_segment_analysis"}
+        )
+    except Exception as e:
+        raise Exception(f"Error calling Anthropic API: {str(e)}") from e
+    # Scan every content block instead of assuming the tool call is first.
+    for block in response.content or []:
+        if getattr(block, "type", None) == "tool_use" and hasattr(block, "input"):
+            return block.input
+    raise Exception("No valid tool call found in the response")
+def get_llm_by_api_key(api_key):
+    """Return a LangChain chat model chosen from the API key's prefix.
+
+    Keys starting with ``sk-ant-`` select Claude, other ``sk-`` keys select
+    OpenAI, and anything else is treated as a Gemini key.
+
+    Args:
+        api_key: Provider API key entered by the user.
+
+    Returns:
+        A configured ``ChatAnthropic``, ``ChatOpenAI`` or
+        ``ChatGoogleGenerativeAI`` instance.
+
+    Raises:
+        ValueError: If ``api_key`` is empty or ``None``.
+    """
+    if not api_key:
+        raise ValueError("API key is required")
+    # The "sk-ant-" check must come first: Claude keys also start with "sk-".
+    if api_key.startswith("sk-ant-"):  # Claude API key format
+        from langchain_anthropic import ChatAnthropic
+        return ChatAnthropic(
+            anthropic_api_key=api_key,
+            model_name=CLAUDE_MODEL,
+            temperature=DEFAULT_TEMPERATURE,
+            max_retries=3
+        )
+    elif api_key.startswith("sk-"):  # OpenAI API key format
+        from langchain_openai import ChatOpenAI
+        return ChatOpenAI(
+            openai_api_key=api_key,
+            model_name=OPENAI_MODEL,
+            temperature=DEFAULT_TEMPERATURE,
+            max_retries=3
+        )
+    else:  # Default to Gemini
+        from langchain_google_genai import ChatGoogleGenerativeAI
+        # Pass the key to the client directly rather than writing it into
+        # os.environ, which is process-global and shared by all app users.
+        return ChatGoogleGenerativeAI(
+            model=GEMINI_MODEL,
+            google_api_key=api_key,
+            temperature=DEFAULT_TEMPERATURE,
+            max_retries=3
+        )
+def _extract_json_from_response(content):
+    """Parse the JSON object embedded in an LLM reply.
+
+    Prefers a ```json fenced block, then the outermost ``{...}`` span, and
+    finally falls back to parsing the whole reply.
+    """
+    fenced = re.search(r'```json\s*([\s\S]*?)\s*```', content)
+    if fenced:
+        json_str = fenced.group(1)
+    else:
+        braced = re.search(r'(\{[\s\S]*\})', content)
+        json_str = braced.group(1) if braced else content
+    return json.loads(json_str)
+def segment_and_analyze_text(text: str, api_key: str, course_name="", section_name="", lesson_name="") -> Dict[str, Any]:
+    """Segment ``text`` and generate quiz questions with the provider implied by ``api_key``.
+
+    Claude keys (``sk-ant-`` prefix) are routed to ``generate_with_claude``;
+    every other key goes through the LangChain model returned by
+    ``get_llm_by_api_key``, and the JSON in its reply is parsed.
+
+    Args:
+        text: Source text to analyze.
+        api_key: Provider API key; its prefix selects the backend.
+        course_name: Course name substituted into the prompt template.
+        section_name: Section name substituted into the prompt template.
+        lesson_name: Lesson name substituted into the prompt template.
+
+    Returns:
+        The parsed analysis (the JSON object returned by the model).
+
+    Raises:
+        Exception: If the API call fails, or if no JSON can be parsed from
+            the model's reply.
+    """
+    from prompts import ANALYSIS_PROMPT_TEMPLATE_GEMINI
+    if api_key.startswith("sk-ant-"):
+        return generate_with_claude(text, api_key, course_name, section_name, lesson_name)
+    # For other models, use LangChain
+    llm = get_llm_by_api_key(api_key)
+    prompt = ANALYSIS_PROMPT_TEMPLATE_GEMINI.format(
+        course_name=course_name,
+        section_name=section_name,
+        lesson_name=lesson_name,
+        text=text
+    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt}
+    ]
+    # API failures and parse failures are handled separately so that a bad
+    # reply is not reported as "Error calling API".
+    try:
+        response = llm.invoke(messages)
+    except Exception as e:
+        raise Exception(f"Error calling API: {str(e)}") from e
+    try:
+        return _extract_json_from_response(response.content)
+    except json.JSONDecodeError as e:
+        raise Exception("Could not parse JSON from LLM response") from e
 def format_quiz_for_display(results):
+    """Render an analysis result as plain text for display.
+
+    Args:
+        results: Dict with an optional ``course_info`` dict and a
+            ``segments`` list; each segment may carry ``topic_name``,
+            ``key_concepts``, ``summary`` and ``quiz_questions``.
+
+    Returns:
+        A newline-joined string with a course header (when ``course_info``
+        is present) followed by one section per segment. Correct answer
+        options are marked with " ✓".
+
+    Segment content is LLM-generated, so missing fields are rendered as
+    empty or "N/A" instead of raising ``KeyError``.
+    """
     output = []
+    divider = "=" * 40
+    if "course_info" in results:
+        course_info = results["course_info"] or {}
+        output.append(divider)
+        output.append(f"COURSE: {course_info.get('course_name', 'N/A')}")
+        output.append(f"SECTION: {course_info.get('section_name', 'N/A')}")
+        output.append(f"LESSON: {course_info.get('lesson_name', 'N/A')}")
+        output.append(f"{divider}\n")
+    segments = results.get("segments") or []
+    for segment_num, segment in enumerate(segments, start=1):
+        topic = segment.get("topic_name", "N/A")
+        output.append(f"\n\n{divider}")
         output.append(f"SEGMENT {segment_num}: {topic}")
+        output.append(f"{divider}\n")
         output.append("KEY CONCEPTS:")
+        for concept in segment.get("key_concepts") or []:
             output.append(f"• {concept}")
         output.append("\nSUMMARY:")
+        output.append(segment.get("summary", ""))
         output.append("\nQUIZ QUESTIONS:")
+        # Distinct index names: the original reused `i` for both loops.
+        for q_num, q in enumerate(segment.get("quiz_questions") or [], start=1):
+            output.append(f"\n{q_num}. {q.get('question', '')}")
+            for j, option in enumerate(q.get("options") or []):
+                letter = chr(ord("A") + j)
+                correct_marker = " ✓" if option.get("correct") else ""
+                output.append(f"   {letter}. {option.get('text', '')}{correct_marker}")
     return "\n".join(output)
+def analyze_document(text, api_key, course_name, section_name, lesson_name):
+    """Analyze a document and return the display text plus two download files.
+
+    The text is split with ``split_text_by_tokens``, each part is analyzed
+    with ``segment_and_analyze_text``, and the resulting segments are
+    renumbered consecutively across parts.
+
+    Args:
+        text: Document text to analyze.
+        api_key: Provider API key forwarded to the analysis call.
+        course_name: Course name stored in the result and sent to the model.
+        section_name: Section name stored in the result and sent to the model.
+        lesson_name: Lesson name stored in the result and sent to the model.
+
+    Returns:
+        ``(formatted_text, json_path, txt_path)`` on success, or
+        ``(error_message, None, None)`` if the input is empty or any step
+        raises.
+    """
+    if not text or not text.strip():
+        return "Error processing document: No text provided", None, None
     try:
+        start_time = time.time()
+        # Split text if it's too long
+        text_parts = split_text_by_tokens(text)
+        all_results = {
+            "course_info": {
+                "course_name": course_name,
+                "section_name": section_name,
+                "lesson_name": lesson_name
+            },
+            "segments": []
+        }
+        segment_counter = 1
+        # Process each part of the text
+        for part in text_parts:
+            analysis = segment_and_analyze_text(
+                part,
+                api_key,
+                course_name=course_name,
+                section_name=section_name,
+                lesson_name=lesson_name
+            )
+            if "segments" in analysis:
+                for segment in analysis["segments"]:
+                    # Renumber so segments stay consecutive across parts.
+                    segment["segment_number"] = segment_counter
+                    all_results["segments"].append(segment)
+                    segment_counter += 1
+        total_time = time.time() - start_time
+        # Format the results for display
+        formatted_text = format_quiz_for_display(all_results)
+        formatted_text = f"Total processing time: {total_time:.2f} seconds\n\n" + formatted_text
+        # NamedTemporaryFile creates the file atomically; tempfile.mktemp is
+        # deprecated and race-prone. delete=False keeps the files available
+        # for download after this function returns.
+        with tempfile.NamedTemporaryFile(
+            mode='w', suffix='.json', delete=False, encoding='utf-8'
+        ) as json_file:
+            # ensure_ascii=False keeps non-ASCII quiz text readable.
+            json.dump(all_results, json_file, indent=2, ensure_ascii=False)
+            json_path = json_file.name
+        with tempfile.NamedTemporaryFile(
+            mode='w', suffix='.txt', delete=False, encoding='utf-8'
+        ) as txt_file:
+            txt_file.write(formatted_text)
+            txt_path = txt_file.name
+        return formatted_text, json_path, txt_path
     except Exception as e:
+        error_message = f"Error processing document: {str(e)}"
+        return error_message, None, None
 with gr.Blocks(title="Quiz Generator") as app:
     gr.Markdown("# Quiz Generator")
+    with gr.Row():
+        with gr.Column():
+            course_name = gr.Textbox(
+                placeholder="Enter the course name",
+                label="Course Name"
+            )
+            section_name = gr.Textbox(
+                placeholder="Enter the section name",
+                label="Section Name"
+            )
+            lesson_name = gr.Textbox(
+                placeholder="Enter the lesson name",
+                label="Lesson Name"
+            )
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
             )
             api_key = gr.Textbox(
+                label="API Key",
+                placeholder="Enter your OpenAI, Claude, or Gemini API key",
                 type="password"
             )
     analyze_btn.click(
         fn=analyze_document,
+        inputs=[input_text, api_key, course_name, section_name, lesson_name],
         outputs=[output_results, json_file_output, txt_file_output]
     )
 if __name__ == "__main__":
     app.launch()