Spaces:

rohangbs
/

Finetune

Sleeping

App Files Files Community

rohangbs committed on Jan 4, 2025

Commit

79b08ef

verified ·

1 Parent(s): b907b51

Create app.py

Browse files

Files changed (1) hide show

app.py +283 -0

app.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import streamlit as st
+import json
+import os
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from PyPDF2 import PdfReader
+from openai import OpenAI
+import time
+from PIL import Image
+class IntegratedChatSystem:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.client = OpenAI(api_key=api_key)
+        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+        self.embedding_dim = 384
+        self.index = faiss.IndexFlatIP(self.embedding_dim)
+        self.metadata = []
+        self.fine_tuned_model = None
+    def add_image(self, image, context_text: str):
+        """Add an image and its context to the retrieval system"""
+        try:
+            # Generate embedding for the context text
+            embedding = self.embedding_model.encode(context_text)
+            embedding = np.expand_dims(embedding, axis=0)
+            # Save image and add to index
+            if not os.path.exists('uploaded_images'):
+                os.makedirs('uploaded_images')
+            # Generate unique filename
+            filename = f"image_{len(self.metadata)}.jpg"
+            image_path = os.path.join('uploaded_images', filename)
+            # Save image
+            image.save(image_path)
+            # Add to FAISS index
+            self.index.add(embedding)
+            self.metadata.append({
+                "filepath": image_path,
+                "context": context_text
+            })
+            return True
+        except Exception as e:
+            st.error(f"Error adding image: {str(e)}")
+            return False
+    def search_relevant_images(self, query: str, similarity_threshold: float = 0.7, top_k: int = 3):
+        """Search for relevant images based on query"""
+        try:
+            if self.index.ntotal == 0:
+                return []
+            # Generate embedding for the query
+            query_embedding = self.embedding_model.encode(query)
+            query_embedding = np.expand_dims(query_embedding, axis=0)
+            # Search in the index
+            distances, indices = self.index.search(query_embedding, min(top_k, self.index.ntotal))
+            # Filter results based on similarity threshold
+            relevant_images = [
+                self.metadata[i] for i, distance in zip(indices[0], distances[0])
+                if i != -1 and distance >= similarity_threshold
+            ]
+            return relevant_images
+        except Exception as e:
+            st.error(f"Error searching images: {str(e)}")
+            return []
+    def generate_qna_pairs(self, text: str):
+        """Generate question-answer pairs from text using OpenAI API"""
+        try:
+            completion = self.client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Generate 11 relevant question-answer pairs from the given text. Format each pair as a complete, informative question with its corresponding detailed answer."},
+                    {"role": "user", "content": f"Text: {text}"}
+                ],
+                temperature=0.7
+            )
+            response_text = completion.choices[0].message.content
+            qa_pairs = []
+            pairs = response_text.split('\n\n')
+            for pair in pairs:
+                if 'Q:' in pair and 'A:' in pair:
+                    question = pair.split('A:')[0].replace('Q:', '').strip()
+                    answer = pair.split('A:')[1].strip()
+                    qa_pairs.append({
+                        "messages": [
+                            {"role": "system", "content": "You are an assistant chatbot. You should help the user by answering their question."},
+                            {"role": "user", "content": question},
+                            {"role": "assistant", "content": answer}
+                        ]
+                    })
+            return qa_pairs
+        except Exception as e:
+            st.error(f"Error generating QA pairs: {str(e)}")
+            return []
+    def create_fine_tuning_job(self, training_file_id):
+        try:
+            response = self.client.fine_tuning.jobs.create(
+                training_file=training_file_id,
+                model="gpt-3.5-turbo-0125"
+            )
+            return response.id
+        except Exception as e:
+            st.error(f"Error creating fine-tuning job: {str(e)}")
+            return None
+    def monitor_fine_tuning_job(self, job_id):
+        try:
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            details_text = st.empty()
+            stages = {
+                "validating_files": "Validating training files...",
+                "queued": "Job queued - waiting to start...",
+                "running": "Training in progress...",
+                "succeeded": "Training completed successfully!",
+                "failed": "Training failed.",
+                "cancelled": "Training was cancelled."
+            }
+            # Approximate progress percentages for each stage
+            progress_mapping = {
+                "validating_files": 0.1,
+                "queued": 0.2,
+                "running": 0.6,
+                "succeeded": 1.0,
+                "failed": 1.0,
+                "cancelled": 1.0
+            }
+            last_status = None
+            start_time = time.time()
+            while True:
+                job_status = self.client.fine_tuning.jobs.retrieve(job_id)
+                current_status = job_status.status
+                # Update progress bar
+                progress_bar.progress(progress_mapping.get(current_status, 0))
+                # Update status message
+                status_message = stages.get(current_status, "Processing...")
+                status_text.markdown(f"**Status:** {status_message}")
+                # Show elapsed time and other details
+                elapsed_time = int(time.time() - start_time)
+                details_text.markdown(f"""
+                    **Details:**
+                    - Time elapsed: {elapsed_time // 60}m {elapsed_time % 60}s
+                    - Job ID: {job_id}
+                    - Current stage: {current_status}
+                """)
+                # Status changed notification
+                if current_status != last_status:
+                    if current_status == "running":
+                        st.info("🚀 Model training has begun!")
+                    elif current_status == "succeeded":
+                        st.success("✅ Fine-tuning completed successfully!")
+                        self.fine_tuned_model = job_status.fine_tuned_model
+                        st.balloons()  # Celebration effect
+                        # Display model details
+                        st.markdown(f"""
+                            **Training Completed!**
+                            - Model ID: `{self.fine_tuned_model}`
+                            - Total training time: {elapsed_time // 60}m {elapsed_time % 60}s
+                            - Status: Ready to use
+                            You can now use the chat interface to interact with your fine-tuned model!
+                        """)
+                        return True
+                    elif current_status in ["failed", "cancelled"]:
+                        st.error(f"❌ Training {current_status}. Please check the OpenAI dashboard for details.")
+                        return False
+                last_status = current_status
+                time.sleep(10)
+        except Exception as e:
+            st.error(f"Error monitoring fine-tuning job: {str(e)}")
+            return False
+# Initialize Streamlit interface
+st.title("PDF Fine-tuning and Chat System with Image Retrieval")
+# Initialize session state
+if 'chat_system' not in st.session_state:
+    api_key = "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"
+    st.session_state.chat_system = IntegratedChatSystem(api_key)
+# Sidebar for image upload
+with st.sidebar:
+    st.header("Image Upload")
+    uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
+    image_context = st.text_area("Image Context Description")
+    if uploaded_image and image_context and st.button("Add Image"):
+        image = Image.open(uploaded_image)
+        if st.session_state.chat_system.add_image(image, image_context):
+            st.success("Image added successfully!")
+# Main area tabs
+tab1, tab2 = st.tabs(["Fine-tuning", "Chat"])
+with tab1:
+    st.header("Upload and Fine-tune")
+    uploaded_file = st.file_uploader("Upload a PDF for Fine-Tuning", type=["pdf"])
+    if uploaded_file is not None:
+        if st.button("Process and Fine-tune"):
+            with st.spinner("Processing PDF..."):
+                # Extract text from PDF
+                reader = PdfReader(uploaded_file)
+                text = "\n".join([page.extract_text() for page in reader.pages])
+                # Show processing steps
+                progress_placeholder = st.empty()
+                # Step 1: Generate QA pairs
+                progress_placeholder.text("Step 1/3: Generating QA pairs...")
+                qa_pairs = st.session_state.chat_system.generate_qna_pairs(text)
+                if qa_pairs:
+                    # Step 2: Save and upload training file
+                    progress_placeholder.text("Step 2/3: Preparing training file...")
+                    jsonl_file = "questions_and_answers.jsonl"
+                    with open(jsonl_file, 'w') as f:
+                        for pair in qa_pairs:
+                            json.dump(pair, f)
+                            f.write("\n")
+                    with open(jsonl_file, "rb") as f:
+                        response = st.session_state.chat_system.client.files.create(
+                            file=f,
+                            purpose="fine-tune"
+                        )
+                        training_file_id = response.id
+                    # Step 3: Start fine-tuning
+                    progress_placeholder.text("Step 3/3: Starting fine-tuning process...")
+                    job_id = st.session_state.chat_system.create_fine_tuning_job(training_file_id)
+                    if job_id:
+                        progress_placeholder.empty()  # Clear the step indicator
+                        st.info(f"🎯 Fine-tuning job initiated!")
+                        st.session_state.chat_system.monitor_fine_tuning_job(job_id)
+with tab2:
+    st.header("Chat Interface")
+    if st.session_state.chat_system.fine_tuned_model:
+        st.success(f"Using fine-tuned model: {st.session_state.chat_system.fine_tuned_model}")
+    else:
+        st.info("Using default model (fine-tuned model not available)")
+    user_message = st.text_input("Enter your message:")
+    if st.button("Send") and user_message:
+        result = st.session_state.chat_system.chat(user_message)
+        st.write("Response:", result["response"])
+        if result["relevant_images"]:
+            st.subheader("Relevant Images:")
+            for img_data in result["relevant_images"]:
+                if os.path.exists(img_data["filepath"]):
+                    image = Image.open(img_data["filepath"])
+                    st.image(image, caption=img_data["context"])