Spaces:

IW2025
/

InclusiveWorldChatbot

Sleeping

App Files Files Community

IW2025 committed on Jul 16, 2025

Commit

e5b03ae

verified ·

1 Parent(s): 9fc11fb

Upload 10 files

Browse files

Files changed (11) hide show

.gitattributes +6 -0
Slides/.DS_Store +0 -0
Slides/Copy of Week 4 Lesson 2.pptx (1).pdf +3 -0
Slides/Copy of Week 4 Lesson.pptx (2).pdf +3 -0
Slides/Copy of Week 6 lesson.pptx (1).pdf +3 -0
Slides/Copy of Week 7 lesson.pptx.pdf +3 -0
Slides/Copy of week 5 lesson.pptx.pdf +3 -0
Slides/Sreekar - week 5 lesson.pptx.pdf +3 -0
app.py +335 -0
app_config.toml +31 -0
requirements.txt +14 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]4[[:space:]]Lesson[[:space:]]2.pptx[[:space:]](1).pdf filter=lfs diff=lfs merge=lfs -text
+Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]4[[:space:]]Lesson.pptx[[:space:]](2).pdf filter=lfs diff=lfs merge=lfs -text
+Slides/Copy[[:space:]]of[[:space:]]week[[:space:]]5[[:space:]]lesson.pptx.pdf filter=lfs diff=lfs merge=lfs -text
+Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]6[[:space:]]lesson.pptx[[:space:]](1).pdf filter=lfs diff=lfs merge=lfs -text
+Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]7[[:space:]]lesson.pptx.pdf filter=lfs diff=lfs merge=lfs -text
+Slides/Sreekar[[:space:]]-[[:space:]]week[[:space:]]5[[:space:]]lesson.pptx.pdf filter=lfs diff=lfs merge=lfs -text

Slides/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

Slides/Copy of Week 4 Lesson 2.pptx (1).pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:599121f746db2f8e9da2e96d83122f02e940fa49830e3404d5359054672eddb2
+size 245349

Slides/Copy of Week 4 Lesson.pptx (2).pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a09f25bb816d3e73e84184e0aae715fd9b008d573a31ccc25769d696d1c1e21
+size 307124

Slides/Copy of Week 6 lesson.pptx (1).pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ee7535c27d3c649a8ad7fbd8e3e9b362c92c4c5f50f797b0a08d89e140789dc
+size 689156

Slides/Copy of Week 7 lesson.pptx.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f296602886643fec267981b85b5d4ce0a54c8cfde56aca42b80a9fbbe87e6004
+size 316333

Slides/Copy of week 5 lesson.pptx.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98a2e6632af2ddedf1efad7ec386c2ca8ea6161bfc1d63390eed22f6ca4a9943
+size 338567

Slides/Sreekar - week 5 lesson.pptx.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc94f5cfa4d28eece6bbf077dd62dfc92878724413cbaf2e37aa102d931235d9
+size 338571

app.py ADDED Viewed

	@@ -0,0 +1,335 @@

+import gradio as gr
+import os
+from pathlib import Path
+import fitz  # PyMuPDF
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import RetrievalQA
+from langchain.llms import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch
+from typing import List, Dict, Any
+import re
+class CurriculumAssistant:
+    def __init__(self):
+        self.vector_db = None
+        self.qa_chain = None
+        self.embeddings = None
+        self.llm = None
+        self.curriculum_docs = []
+        self.pdf_pages = {}  # Store page-level information
+    def load_llm(self):
+        """Load the LLaMA 3.1 model from Hugging Face"""
+        try:
+            model_name = "microsoft/DialoGPT-medium"
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
+            )
+            pipe = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                max_new_tokens=256,
+                temperature=0.7,
+                top_p=0.95,
+                repetition_penalty=1.15
+            )
+            self.llm = HuggingFacePipeline(pipeline=pipe)
+            return True
+        except Exception as e:
+            print(f"Error loading model: {str(e)}")
+            return False
+    def extract_text_from_pdf_with_pages(self, pdf_path: str) -> Dict[int, str]:
+        """Extract text from PDF file with page numbers"""
+        try:
+            doc = fitz.open(pdf_path)
+            pages = {}
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+                if text.strip():  # Only store non-empty pages
+                    pages[page_num + 1] = text.strip()
+            doc.close()
+            return pages
+        except Exception as e:
+            print(f"Error extracting text from {pdf_path}: {str(e)}")
+            return {}
+    def process_curriculum(self, slides_dir: str):
+        """Process all PDF files in the slides directory"""
+        try:
+            slides_path = Path(slides_dir)
+            pdf_files = list(slides_path.glob("*.pdf"))
+            if not pdf_files:
+                print("No PDF files found in the Slides directory!")
+                return False
+            all_texts = []
+            all_chunks_with_metadata = []
+            for pdf_file in pdf_files:
+                print(f"Processing: {pdf_file.name}")
+                # Extract text with page information
+                pages = self.extract_text_from_pdf_with_pages(str(pdf_file))
+                self.pdf_pages[pdf_file.name] = pages
+                # Combine all pages for vector database
+                full_text = "\n\n".join([f"Page {page_num}: {text}" for page_num, text in pages.items()])
+                if full_text:
+                    all_texts.append(full_text)
+                    self.curriculum_docs.append({
+                        'filename': pdf_file.name,
+                        'content': full_text[:500] + "..." if len(full_text) > 500 else full_text,
+                        'pages': pages
+                    })
+            if not all_texts:
+                print("No text could be extracted from PDF files!")
+                return False
+            # Split text into chunks with metadata
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=1000,
+                chunk_overlap=200,
+                length_function=len,
+            )
+            for i, text in enumerate(all_texts):
+                chunks = text_splitter.split_text(text)
+                for j, chunk in enumerate(chunks):
+                    # Add metadata to track which document and approximate page
+                    all_chunks_with_metadata.append({
+                        'text': chunk,
+                        'metadata': {
+                            'filename': pdf_files[i].name,
+                            'chunk_id': j,
+                            'source': 'curriculum'
+                        }
+                    })
+            # Create embeddings
+            self.embeddings = HuggingFaceEmbeddings(
+                model_name="sentence-transformers/all-MiniLM-L6-v2"
+            )
+            # Create vector database with metadata
+            texts = [chunk['text'] for chunk in all_chunks_with_metadata]
+            metadatas = [chunk['metadata'] for chunk in all_chunks_with_metadata]
+            self.vector_db = Chroma.from_texts(
+                texts=texts,
+                embedding=self.embeddings,
+                metadatas=metadatas,
+                persist_directory="./chroma_db"
+            )
+            print(f"Processed {len(pdf_files)} curriculum documents!")
+            return True
+        except Exception as e:
+            print(f"Error processing curriculum: {str(e)}")
+            return False
+    def create_qa_chain(self):
+        """Create the QA chain with custom prompts"""
+        if not self.vector_db or not self.llm:
+            return False
+        # Custom prompt template for Q&A
+        qa_template = """You are an expert programming instructor for the Inclusive World Curriculum.
+        Use the following context to answer the student's question. If the information is not in the context,
+        provide a helpful response based on your knowledge of programming concepts.
+        Context: {context}
+        Question: {question}
+        Answer:"""
+        self.qa_chain = RetrievalQA.from_chain_type(
+            llm=self.llm,
+            chain_type="stuff",
+            retriever=self.vector_db.as_retriever(search_kwargs={"k": 5}),
+            chain_type_kwargs={
+                "prompt": PromptTemplate(
+                    template=qa_template,
+                    input_variables=["context", "question"]
+                )
+            }
+        )
+        return True
+    def find_relevant_pages(self, question: str, filename: str = None) -> List[Dict]:
+        """Find relevant pages for a given question"""
+        try:
+            # Search for relevant chunks
+            results = self.vector_db.similarity_search(question, k=5)
+            relevant_pages = []
+            seen_pages = set()
+            for result in results:
+                metadata = result.metadata
+                doc_filename = metadata.get('filename', '')
+                # If filename is specified, only look in that file
+                if filename and doc_filename != filename:
+                    continue
+                # Extract page information from chunk text
+                chunk_text = result.page_content
+                # Look for page numbers in the chunk
+                page_matches = re.findall(r'Page (\d+):', chunk_text)
+                for page_num in page_matches:
+                    page_key = f"{doc_filename}_page_{page_num}"
+                    if page_key not in seen_pages:
+                        seen_pages.add(page_key)
+                        # Get the actual page content
+                        if doc_filename in self.pdf_pages:
+                            page_content = self.pdf_pages[doc_filename].get(int(page_num), "")
+                            if page_content:
+                                relevant_pages.append({
+                                    'filename': doc_filename,
+                                    'page_number': int(page_num),
+                                    'content': page_content,
+                                    'relevance_score': len(chunk_text)  # Simple relevance metric
+                                })
+            # Sort by relevance and return top results
+            relevant_pages.sort(key=lambda x: x['relevance_score'], reverse=True)
+            return relevant_pages[:3]  # Return top 3 most relevant pages
+        except Exception as e:
+            print(f"Error finding relevant pages: {str(e)}")
+            return []
+def initialize_system():
+    """Initialize the curriculum assistant system"""
+    assistant = CurriculumAssistant()
+    # Load LLM
+    if not assistant.load_llm():
+        return "❌ Failed to load language model", None, None
+    # Process curriculum
+    if not assistant.process_curriculum("Slides"):
+        return "❌ Failed to process curriculum documents", None, None
+    # Create QA chain
+    if not assistant.create_qa_chain():
+        return "❌ Failed to create QA chain", None, None
+    return "✅ System initialized successfully!", assistant, assistant.curriculum_docs
+def ask_question(question: str, assistant: CurriculumAssistant):
+    """Ask a question and get answer with relevant pages"""
+    if not assistant or not assistant.qa_chain:
+        return "Please initialize the system first.", "", ""
+    try:
+        # Get answer from QA chain
+        answer = assistant.qa_chain.run(question)
+        # Find relevant pages
+        relevant_pages = assistant.find_relevant_pages(question)
+        # Format page information
+        page_info = ""
+        if relevant_pages:
+            page_info = "📄 **Relevant Pages Found:**\n\n"
+            for i, page in enumerate(relevant_pages, 1):
+                page_info += f"**{i}. {page['filename']} - Page {page['page_number']}**\n"
+                page_info += f"```\n{page['content'][:300]}...\n```\n\n"
+        else:
+            page_info = "No specific pages found for this question."
+        # Format the complete response
+        full_response = f"## Answer\n\n{answer}\n\n---\n\n{page_info}"
+        return full_response, answer, page_info
+    except Exception as e:
+        error_msg = f"Error processing question: {str(e)}"
+        return error_msg, "", ""
+# Initialize the system
+status, assistant, curriculum_docs = initialize_system()
+# Create Gradio interface
+with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎓 Inclusive World Curriculum Assistant")
+    gr.Markdown("An AI-powered assistant that answers questions about your curriculum and shows relevant slide pages.")
+    with gr.Row():
+        with gr.Column(scale=2):
+            # Status display
+            status_display = gr.Textbox(
+                value=status,
+                label="System Status",
+                interactive=False
+            )
+            # Question input
+            question_input = gr.Textbox(
+                label="Ask a question about your curriculum",
+                placeholder="e.g., What are if statements? How do loops work?",
+                lines=3
+            )
+            # Submit button
+            submit_btn = gr.Button("🔍 Get Answer", variant="primary")
+            # Answer output
+            answer_output = gr.Markdown(
+                label="Answer with Relevant Pages",
+                value="Ask a question to get started!"
+            )
+        with gr.Column(scale=1):
+            # Curriculum overview
+            gr.Markdown("### 📚 Curriculum Documents")
+            if curriculum_docs:
+                for doc in curriculum_docs:
+                    with gr.Accordion(f"📄 {doc['filename']}", open=False):
+                        gr.Markdown(f"**Preview:** {doc['content']}")
+            else:
+                gr.Markdown("No curriculum documents loaded.")
+    # Handle question submission
+    def process_question(question):
+        return ask_question(question, assistant)
+    submit_btn.click(
+        fn=process_question,
+        inputs=[question_input],
+        outputs=[answer_output]
+    )
+    # Handle Enter key in question input
+    question_input.submit(
+        fn=process_question,
+        inputs=[question_input],
+        outputs=[answer_output]
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.launch(share=True)

app_config.toml ADDED Viewed

	@@ -0,0 +1,31 @@

+[build]
+python_version = "3.11"
+[env]
+HF_HUB_ENABLE_HF_TRANSFER = "1"
+TRANSFORMERS_CACHE = "/tmp/transformers_cache"
+HF_HOME = "/tmp/hf_home"
+[system_packages]
+# Add any system packages if needed
+[models]
+# Preload models for faster startup
+"microsoft/DialoGPT-medium" = "dialo-medium"
+"sentence-transformers/all-MiniLM-L6-v2" = "all-minilm-l6-v2"
+[datasets]
+# Add any datasets if needed
+[hardware]
+# Hardware requirements for Gradio
+cpu = "2"
+memory = "8GB"
+disk = "10GB"
+[gradio]
+# Gradio specific settings
+title = "Inclusive World Curriculum Assistant"
+description = "AI-powered assistant that answers questions about curriculum and shows relevant slide pages"
+theme = "soft"
+share = false

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+gradio==4.44.0
+langchain==0.3.26
+langchain-community==0.3.27
+chromadb==1.0.15
+sentence-transformers==5.0.0
+transformers==4.35.2
+torch==2.0.1
+PyMuPDF==1.23.8
+accelerate==0.24.1
+huggingface-hub==0.19.4
+numpy==1.24.3
+pandas==2.0.3
+scikit-learn==1.3.0
+tiktoken==0.5.1