Krish-05 committed on
Commit 83bc15a · verified · 1 Parent(s): 9764c9e

Update README.md

Files changed (1)
  1. README.md +145 -273
README.md CHANGED
@@ -9,317 +9,189 @@ pinned: false
9
  license: mit
10
  ---
11
 
12
- import json
13
- from collections import defaultdict
14
- import os
15
- from datasets import Dataset
16
- import torch
17
- from unsloth import FastLanguageModel
18
- from trl import SFTTrainer
19
- from transformers import TrainingArguments
20
- import jsonlines # Recommended for reading .jsonl files
21
-
22
- def remove_duplicates_jsonl(input_file, output_file):
23
- """
24
- Remove duplicate entries from a JSONL file based on prompt and response.
25
- Preserves the first occurrence of each unique entry.
26
-
27
- Args:
28
- input_file (str): Path to input JSONL file
29
- output_file (str): Path to output JSONL file where deduplicated data will be written
30
- """
31
-
32
- # Store unique entries using prompt+response as key
33
- unique_entries = {}
34
- duplicate_count = 0
35
- line_count = 0
36
-
37
- print(f"Processing file: {input_file}")
38
-
39
- try:
40
- # Read input file and track unique entries
41
- with open(input_file, 'r', encoding='utf-8') as f:
42
- for line_num, line in enumerate(f, 1):
43
- try:
44
- # Skip empty lines
45
- if not line.strip():
46
- continue
47
-
48
- # Parse JSON line
49
- data = json.loads(line.strip())
50
- line_count += 1
51
-
52
- # Create unique key from prompt+response
53
- unique_key = f"{data.get('prompt', '')}|{data.get('response', '')}"
54
-
55
- # Track first occurrence of each unique entry
56
- if unique_key not in unique_entries:
57
- unique_entries[unique_key] = data
58
- else:
59
- duplicate_count += 1
60
-
61
- except json.JSONDecodeError as e:
62
- print(f"Error parsing JSON on line {line_num}: {str(e)}")
63
- continue
64
-
65
- # Write unique entries to output file
66
- with open(output_file, 'w', encoding='utf-8') as f:
67
- for data in unique_entries.values():
68
- json_str = json.dumps(data, ensure_ascii=False)
69
- f.write(json_str + '\n')
70
-
71
- # Print summary
72
- print("\nDeduplication Summary:")
73
- print(f"Total lines processed: {line_count}")
74
- print(f"Duplicate entries removed: {duplicate_count}")
75
- print(f"Unique entries remaining: {len(unique_entries)}")
76
- print(f"Output written to: {output_file}")
77
-
78
- except Exception as e:
79
- print(f"Error processing file: {str(e)}")
80
- return
81
-
82
- if __name__ == "__main__":
83
- input_file = "prompt_response_pairs.jsonl"
84
- output_file = "prompt_response_pairs_deduped.jsonl"
85
-
86
- remove_duplicates_jsonl(input_file, output_file)
87
-
88
- import json
89
- import jsonlines # Recommended for reading .jsonl files
90
-
91
- # --- 1. Load data from .jsonl file ---
92
- file_path = "prompt_response_pairs_deduped.jsonl" # Change extension to .jsonl
93
-
94
- # Read .jsonl file
95
- data = []
96
- with jsonlines.open(file_path, 'r') as reader:
97
- for obj in reader:
98
- data.append(obj)
99
-
100
- print(f"Loaded {len(data)} entries from {file_path}")
101
- if len(data) > 0: # Check if data is not empty before trying to print
102
- print("First entry:", data[0])
103
-
104
- # For GPU check
105
- import torch
106
- print(f"CUDA available: {torch.cuda.is_available()}")
107
- print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
108
-
109
- from unsloth import FastLanguageModel
110
- import torch
111
-
112
- model_name = "unsloth/llama-3-8b-bnb-4bit"
113
-
114
- max_seq_length = 2048 # Choose sequence length
115
- dtype = None # Auto detection
116
-
117
- # Load model and tokenizer
118
- model, tokenizer = FastLanguageModel.from_pretrained(
119
- model_name=model_name,
120
- max_seq_length=max_seq_length,
121
- dtype=dtype,
122
- load_in_4bit=True,
123
- )
124
-
125
- import json # Still needed for potential other uses, but not directly for response stringifying here
126
- from datasets import Dataset
127
- def format_prompt(example):
128
- user_prompt = example.get('prompt', '')
129
- assistant_response = example.get('response', '')
130
- # Llama 3 chat template
131
- # We are formatting the training data as a full conversation turn.
132
- return f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_response}<|eot_id|>"
133
-
134
- formatted_data = [format_prompt(item) for item in data] # Use 'data' loaded from .jsonl
135
- dataset = Dataset.from_dict({"text": formatted_data})
136
-
137
- # Add LoRA adapters
138
- model = FastLanguageModel.get_peft_model(
139
- model,
140
- r=64, # LoRA rank - higher = more capacity, more memory
141
- target_modules=[
142
- "q_proj", "k_proj", "v_proj", "o_proj",
143
- "gate_proj", "up_proj", "down_proj",
144
- ],
145
- lora_alpha=128, # LoRA scaling factor (usually 2x rank)
146
- lora_dropout=0, # Supports any, but = 0 is optimized
147
- bias="none", # Supports any, but = "none" is optimized
148
- use_gradient_checkpointing="unsloth", # Unsloth's optimized version
149
- random_state=3407,
150
- use_rslora=False, # Rank stabilized LoRA
151
- loftq_config=None, # LoftQ
152
- )
153
-
154
- from trl import SFTTrainer
155
- from transformers import TrainingArguments
156
-
157
- # Training arguments optimized for Unsloth
158
- trainer = SFTTrainer(
159
- model=model,
160
- tokenizer=tokenizer,
161
- train_dataset=dataset,
162
- dataset_text_field="text",
163
- max_seq_length=max_seq_length,
164
- dataset_num_proc=2,
165
- args=TrainingArguments(
166
- per_device_train_batch_size=2,
167
- gradient_accumulation_steps=4, # Effective batch size = 8
168
- warmup_steps=10,
169
- num_train_epochs=3,
170
- learning_rate=2e-4,
171
- fp16=not torch.cuda.is_bf16_supported(),
172
- bf16=torch.cuda.is_bf16_supported(),
173
- logging_steps=25,
174
- optim="adamw_8bit",
175
- weight_decay=0.01,
176
- lr_scheduler_type="linear",
177
- seed=3407,
178
- output_dir="outputs",
179
- save_strategy="epoch",
180
- save_total_limit=2,
181
- dataloader_pin_memory=False,
182
- ),
183
- )
184
-
185
- # Train the model
186
- trainer_stats = trainer.train()
187
- print("Training complete!")
188
-
189
- # Option A: Save just the LoRA adapters (most common for continued fine-tuning)
190
- # This creates a folder with adapter_model.safetensors and tokenizer files.
191
- lora_model_dir = "./lora_adapters_saved"
192
- print(f"\nSaving LoRA adapters to: {lora_model_dir}")
193
- model.save_pretrained(lora_model_dir)
194
- tokenizer.save_pretrained(lora_model_dir)
195
- print("LoRA adapters and tokenizer saved!")
196
-
197
-
198
- # --- 5. Test the fine-tuned model ---
199
- FastLanguageModel.for_inference(model) # Enable Unsloth's native 2x faster inference
200
-
201
- # Test prompt - adjust the prompt to match the Llama 3 chat template for inference
202
- # The prompt should be just the user's part of the conversation for inference.
203
- messages = [
204
- {"role": "user", "prompt": "i have a question about cancelling order 12345"},
205
- ]
206
-
207
- inputs = tokenizer.apply_chat_template(
208
- messages,
209
- tokenize=True,
210
- add_generation_prompt=True, # This adds the assistant turn for generation
211
- return_tensors="pt",
212
- ).to("cuda")
213
-
214
- # Generate response
215
- print("\nGenerating response with fine-tuned model...")
216
- outputs = model.generate(
217
- input_ids=inputs,
218
- max_new_tokens=256,
219
- use_cache=True,
220
- temperature=0.7,
221
- do_sample=True,
222
- top_p=0.9,
223
- )
224
 
225
- # Decode and print
226
- response = tokenizer.decode(outputs[0], skip_special_tokens=False) # Keep tokens for inspection
227
- print(response)
228
 
229
- # To get just the generated assistant part, you'd typically parse the response string.
230
- try:
231
- assistant_start = response.find("<|start_header_id|>assistant<|end_header_id|>\n\n")
232
- if assistant_start != -1:
233
- generated_text = response[assistant_start + len("<|start_header_id|>assistant<|end_header_id|>\n\n"):].strip()
234
- # Remove any trailing <|eot_id|> or other special tokens if present
235
- generated_text = generated_text.replace("<|eot_id|>", "").strip()
236
- print("\nExtracted Generated Output:")
237
- print(generated_text)
238
- except Exception as e:
239
- print(f"Could not extract generated text: {e}")
240
241
 
242
- # --- 6. Save the model in GGUF format ---
243
- # This will save the model with LoRA adapters merged into the base model and quantized.
244
- # The `model.save_pretrained_gguf` function *already* handles merging
245
- # and quantization internally before saving to GGUF.
246
- gguf_output_dir = "gguf_model"
247
- os.makedirs(gguf_output_dir, exist_ok=True) # Ensure the directory exists
248
- print(f"\nSaving model to GGUF format in: {gguf_output_dir}")
249
- model.save_pretrained_gguf(gguf_output_dir, tokenizer, quantization_method="q4_k_m")
250
- print("Model saved in GGUF format!")
251
- Fine-Tuning Details
252
- Base Model: unsloth/llama-3-8b-bnb-4bit (an optimized Llama 3 variant for efficient fine-tuning).
253
 
254
- Method: LoRA (Low-Rank Adaptation) is used for efficient fine-tuning.
255
 
256
- Tokenizer: Uses the Llama 3 chat template to format prompts and responses for training.
257
 
258
- Training Arguments: Configured with the adamw_8bit optimizer, a linear learning-rate scheduler, and per-epoch checkpointing (keeping the last two checkpoints).
259
 
260
- Output Format: The fine-tuned LoRA adapters are saved, and the model is also saved in GGUF format for local deployment with Ollama.
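To reuse the saved adapters in a later session, they can be loaded back with Unsloth. A minimal sketch, assuming the `./lora_adapters_saved` folder written above and the same settings used during training (not verified against this repo):

```python
# Minimal sketch: reloading the saved LoRA adapters for inference or further training.
# Assumes the adapter folder produced by model.save_pretrained() above.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./lora_adapters_saved",  # folder written by model.save_pretrained(...)
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference path
```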
261
 
262
- Architecture and Workflow
263
- The chatbot uses a full-stack architecture with Nginx as the entry point and reverse proxy, FastAPI providing the backend logic, and a React.js frontend for the user interface. Ollama serves the fine-tuned LLM locally within the container.
269
- Conceptual architecture similar to a Hugging Face Space deployment.
270
 
271
- Components:
272
- Frontend (React.js): Provides the user interface for interacting with the chatbot, including a text input area and a voice input feature.
 
273
 
274
- Nginx: Acts as a reverse proxy, routing requests from the frontend to the appropriate backend services (FastAPI). It also serves the static frontend files.
275
 
276
- FastAPI (Python):
 
277
 
278
- LLM Endpoint (/api/ask): Receives text prompts from the frontend, formats them, sends them to the Ollama-served LLM, and streams the generated responses back to the frontend.
279
 
280
- Audio Transcription Endpoint (/api/transcribe-audio): Receives audio blobs from the frontend, uses a Whisper model (loaded within FastAPI) to transcribe the audio, and returns the transcribed text.
 
281
 
282
- Ollama: A local large language model server. It runs the fine-tuned krishna_choudhary/tinyllama model, making it available for inference via an API that FastAPI interacts with.
283
 
284
- Whisper Model (dimavz/whisper-tiny): Integrated within the FastAPI application for speech-to-text functionality.
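For the speech-to-text piece, a minimal sketch of loading a Whisper checkpoint inside FastAPI with the `transformers` pipeline is shown below. The `openai/whisper-tiny` model id is only an illustrative stand-in; the project references `dimavz/whisper-tiny`, and its actual loading code may differ.

```python
# Illustrative only: a tiny Whisper ASR pipeline for the /api/transcribe-audio flow.
# The model id below is a stand-in, not the project's actual checkpoint.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

def transcribe(audio_path: str) -> str:
    """Return the transcribed text for a recorded audio file."""
    return asr(audio_path)["text"]
```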
285
 
286
- Workflow:
287
- User Input: The user types a message or records a voice message in the React frontend.
288
 
289
- Voice to Text (if applicable):
 
 
 
290
 
291
- If a voice message is recorded, the frontend sends the audio blob to FastAPI's /api/transcribe-audio endpoint.
292
 
293
- FastAPI uses the dimavz/whisper-tiny model to transcribe the audio into text.
 
 
294
 
295
- The transcribed text is then sent back to the frontend.
296
 
297
- Text Prompt to LLM:
298
 
299
- Whether the input was typed or transcribed, the frontend sends the text prompt to FastAPI's /api/ask endpoint.
300
 
301
- LLM Inference (FastAPI & Ollama):
302
 
303
- FastAPI receives the user prompt.
304
 
305
- It prepares the prompt to match the Llama 3 chat template required by the krishna_choudhary/tinyllama model.
 
 
306
 
307
- FastAPI then sends this formatted prompt to the locally running Ollama server.
308
 
309
- Ollama processes the prompt using the fine-tuned krishna_choudhary/tinyllama model.
310
 
311
- The LLM generates a response, which is streamed back to FastAPI.
312
 
313
- Response Handling and Token Replacement:
314
 
315
- FastAPI receives the streaming response from Ollama.
 
 
316
 
317
- The fine-tuned model is designed to generate responses containing placeholders (e.g., {{Order Number}}, {{Online Company Portal Info}}).
318
 
319
- FastAPI identifies these placeholders in the LLM's raw output.
320
 
321
- Token Replacement: In production, FastAPI would dynamically replace these placeholders with real data from a database or other internal systems (for example, fetching the customer's actual order number from a CRM, or supplying a real company portal URL). In this project, the model emits responses with the placeholders exactly as they appear in the training data, demonstrating that it can recognize and produce structured responses; the dynamic replacement logic would be implemented in FastAPI (a minimal sketch follows this list).
322
 
323
- The (potentially placeholder-replaced) response is streamed back to the frontend.
324
 
325
- Display Response: The React frontend receives the streamed response and displays it to the user, providing a real-time conversational experience.
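A minimal sketch of the placeholder-replacement step referenced above; the placeholder names and the lookup data are illustrative, not the project's actual sources:

```python
# Hypothetical illustration of replacing {{...}} placeholders emitted by the model.
import re

def fill_placeholders(llm_output: str, lookup: dict[str, str]) -> str:
    """Swap {{Placeholder}} tokens for real values; leave unknown ones untouched."""
    def substitute(match: re.Match) -> str:
        key = match.group(1).strip()
        return lookup.get(key, match.group(0))
    return re.sub(r"\{\{(.*?)\}\}", substitute, llm_output)

raw = "Your order {{Order Number}} has been cancelled. See {{Online Company Portal Info}} for updates."
print(fill_placeholders(raw, {
    "Order Number": "12345",                                    # example value
    "Online Company Portal Info": "https://portal.example.com", # example value
}))
```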
 
9
  license: mit
10
  ---
11
 
12
13
 
14
+ ---
15
+
16
+ # 🤖 Chatbot with Voice + Text | React + FastAPI + Ollama
17
 
18
+ This is a full-stack chatbot application supporting both **text** and **voice input**, powered by:
19
 
20
+ * **🧠 Ollama (LLM)** – for chatbot responses using `krishna_choudhary/tinyllama`
21
+ * **🗣️ Ollama (Whisper STT)** – for voice-to-text transcription using `anagram/whispertiny`
22
+ * **⚛️ React** – for a modern, responsive chat interface
23
+ * **⚡ FastAPI** – for backend API endpoints
24
 
25
+ ---
26
 
27
+ ## 🚀 Features
28
 
29
+ * 💬 **Interactive Chat UI** – Clean and responsive frontend built in React
30
+ * 🎤 **Voice Input (STT)** – Record your voice and transcribe using Whisper
31
+ * 🧠 **Ollama LLM** – Generate AI responses locally using TinyLlama (or your preferred model)
32
+ * 🔁 **Streaming Responses** – Responses are streamed back for a natural chat flow
33
+ * 🐳 **Dockerized** – Easy deployment using Docker
34
+ * 🛠 **Modular Architecture** – Clean separation of concerns (React frontend, FastAPI backend)
35
 
36
+ ---
37
 
38
+ ## 🧩 Tech Stack
39
 
40
+ | Layer | Tech |
41
+ | -------- | -------------------------------------- |
42
+ | Frontend | React (Vite) |
43
+ | Backend | FastAPI |
44
+ | LLM | Ollama - `krishna_choudhary/tinyllama` |
45
+ | STT | Ollama - `anagram/whispertiny` |
46
+ | Audio | Web Audio API |
47
+ | Infra | Docker + Nginx (Production) |
48
 
49
+ ---
50
 
51
+ ## πŸ—‚οΈ Project Structure
52
+
53
+ ```
54
+ .
55
+ ├── Dockerfile           # Full app container (React + FastAPI + Ollama)
56
+ ├── start.sh             # Start script: launches Ollama & FastAPI
57
+ ├── main.py              # FastAPI server logic (LLM + STT endpoints)
58
+ ├── requirements.txt     # Python dependencies
59
+ ├── nginx.conf           # Nginx config for proxying & serving frontend
60
+ └── frontend/
61
+     ├── src/App.jsx      # React component with chat logic
62
+     ├── src/App.css      # App styles
63
+     └── dist/            # Production build output (after npm build)
64
+ ```
65
 
66
+ ---
67
 
68
+ ## βš™οΈ Workflow Overview
69
 
70
+ ### 🔹 Frontend (React)
71
 
72
+ * User types or records voice.
73
+ * Sends requests to `/api/ask` (text) or `/api/transcribe-audio` (audio).
74
+ * Displays streamed response from LLM.
75
 
76
+ ### 🔹 Backend (FastAPI)
77
 
78
+ * `/api/ask`: Sends prompt to TinyLlama via Ollama API and streams back result.
79
+ * `/api/transcribe-audio`: Converts voice to base64, sends to Whisper model, returns transcribed text.
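
A minimal sketch of the `/api/ask` proxy described above, assuming a JSON body with a `prompt` field and Ollama's default streaming format; the real `main.py` may differ, and `/api/transcribe-audio` would follow a similar proxy pattern:

```python
# Sketch only: stream tokens from Ollama's /api/generate back to the caller.
import json
import os

import httpx
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://127.0.0.1:11434")
MODEL_NAME = os.getenv("MODEL_NAME", "krishna_choudhary/tinyllama")

app = FastAPI()

@app.post("/api/ask")
async def ask(payload: dict):
    async def token_stream():
        async with httpx.AsyncClient(timeout=None) as client:
            async with client.stream(
                "POST",
                f"{OLLAMA_HOST}/api/generate",
                json={"model": MODEL_NAME, "prompt": payload["prompt"], "stream": True},
            ) as resp:
                async for line in resp.aiter_lines():
                    if line:  # each line is a JSON object carrying a partial "response"
                        yield json.loads(line).get("response", "")
    return StreamingResponse(token_stream(), media_type="text/plain")
```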
80
 
81
+ ### 🔹 Ollama Server
82
 
83
+ * Hosts and runs both models locally inside Docker.
84
+ * Communicates via `http://localhost:11434/api/generate`.
85
 
86
+ ---
87
 
88
+ ## πŸ› οΈ Local Development (No Docker)
89
 
90
+ ### 1. Clone Repo
 
91
 
92
+ ```bash
93
+ git clone <your-repo-url>
94
+ cd <your-repo-directory>
95
+ ```
96
 
97
+ ### 2. Backend Setup (FastAPI)
98
 
99
+ ```bash
100
+ pip install -r requirements.txt
101
+ ```
102
 
103
+ ### 3. Frontend Setup (React)
104
 
105
+ ```bash
106
+ cd frontend
107
+ npm install
108
+ npm run build
109
+ cd ..
110
+ ```
111
 
112
+ ### 4. Start Ollama Locally
113
 
114
+ ```bash
115
+ ollama pull krishna_choudhary/tinyllama
116
+ ollama pull anagram/whispertiny
117
+ ollama serve
118
+ ```
119
 
120
+ ### 5. Run FastAPI Server
121
 
122
+ ```bash
123
+ uvicorn main:app --host 0.0.0.0 --port 7860
124
+ ```
125
 
126
+ Then access via `http://localhost:7860`
127
 
128
+ ---
129
 
130
+ ## 🐳 Docker Deployment (Recommended)
131
 
132
+ ### 1. Build Docker Image
133
 
134
+ ```bash
135
+ docker build -t chatbot-ollama-app .
136
+ ```
137
 
138
+ ### 2. Run Container
139
 
140
+ ```bash
141
+ docker run -p 8501:8501 \
142
+ -e OLLAMA_HOST="http://127.0.0.1:11434" \
143
+ -e MODEL_NAME="krishna_choudhary/tinyllama" \
144
+ -e WHISPER_MODEL_NAME="anagram/whispertiny" \
145
+ chatbot-ollama-app
146
+ ```
147
 
148
+ ### 3. Access App
149
+
150
+ Open your browser at:
151
+ 👉 `http://localhost:8501`
152
+
153
+ ---
154
+
155
+ ## πŸŽ™οΈ Using the Chatbot
156
+
157
+ * πŸ§‘β€πŸ’» **Text Input**: Type a question and hit **Send**.
158
+ * πŸ—£οΈ **Voice Input**: Click the mic icon, speak, then click again to transcribe and submit.
159
+
160
+ ---
161
 
162
+ ## 🔧 Customization Options
163
+
164
+ | What | How to Customize |
165
+ | ------------- | -------------------------------------------------------- |
166
+ | LLM Model | Change `MODEL_NAME` in `main.py` or `start.sh` |
167
+ | Whisper Model | Update `WHISPER_MODEL_NAME` in `main.py` or `start.sh` |
168
+ | Ollama Host | Modify `OLLAMA_HOST_URL` in `main.py` |
169
+ | Token Speed | Adjust `asyncio.sleep()` delay inside streaming function |
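
The "Token Speed" row refers to a pacing delay in the streaming loop. An illustrative snippet follows; the real streaming function in `main.py` may be structured differently:

```python
# Illustrative only: pace streamed tokens with asyncio.sleep.
import asyncio

async def throttled_stream(tokens, delay: float = 0.02):
    """Yield tokens one by one, pausing `delay` seconds between them."""
    for token in tokens:
        yield token
        await asyncio.sleep(delay)  # raise/lower to slow down or speed up output
```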
170
+
171
+ ---
172
+
173
+ ## 🧪 Quick Test Without Nginx
174
+
175
+ In development, the React dev server can call the backend directly:
176
+
177
+ * Set API base in frontend to `http://localhost:7860/api/`
178
+ * Run React in dev mode:
179
+
180
+ ```bash
181
+ cd frontend
182
+ npm run dev
183
+ ```
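
You can also smoke-test the backend without the frontend. The snippet below assumes `/api/ask` accepts a JSON body with a `prompt` field and streams plain text, as described above:

```python
# Quick backend check without the React frontend (assumed request/response shape).
import requests

with requests.post(
    "http://localhost:7860/api/ask",
    json={"prompt": "i have a question about cancelling order 12345"},
    stream=True,
) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```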
184
+
185
+ ---
186
+
187
+ ## 🀝 License & Credits
188
+
189
+ * Built with ❤️ using [FastAPI](https://fastapi.tiangolo.com/), [React](https://reactjs.org/), and [Ollama](https://ollama.com/)
190
+ * Models:
191
+
192
+ * `krishna_choudhary/tinyllama`
193
+ * `anagram/whispertiny`
194
+
195
+ ---
196
 
197