Spaces:

damoojeje
/

SmartManuals-AI

Sleeping

App Files Community

damoojeje committed on May 21

Commit

835a614

verified ·

1 Parent(s): 6728736

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -165

app.py CHANGED Viewed

@@ -1,187 +1,179 @@
-# ✅ app.py (SmartManuals-AI)
-# Hugging Face Space-ready app with multi-model support, PDF upload, and live progress feedback
 import os
-import json
 import fitz  # PyMuPDF
 import nltk
-import chromadb
-import tempfile
-import shutil
 import pytesseract
 import gradio as gr
-from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-# ---------------------------
-# 🔧 CONFIG
-# ---------------------------
-pdf_folder = "Manuals"
-output_jsonl_chunks = "chunks.jsonl"
-chroma_path = "./chroma_store"
-collection_name = "manual_chunks"
-chunk_size = 750
-chunk_overlap = 100
-MAX_CONTEXT_CHUNKS = 3
 HF_TOKEN = os.environ.get("HF_TOKEN")
-MODEL_MAP = {
-    "LLaMA 3 (8B)": "meta-llama/Meta-Llama-3-8B-Instruct",
-    "LLaMA 4 Scout (17B)": "meta-llama/Meta-Llama-4-Scout-17B-16E-Instruct",
-    "Gemma 3 (27B)": "google/gemma-3-27b-it",
-    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
-    "Qwen3 (30B)": "Qwen/Qwen3-30B-A3B"
-}
-# ---------------------------
-# 📥 UTILITIES
-# ---------------------------
 def clean_text(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
-def tokenize_sentences(text):
-    nltk.download('punkt', quiet=True)
     return sent_tokenize(text)
-def split_into_chunks(sentences, max_tokens=750, overlap=100):
-    chunks, current_chunk, current_len = [], [], 0
-    for sentence in sentences:
-        token_count = len(sentence.split())
-        if current_len + token_count > max_tokens and current_chunk:
-            chunks.append(" ".join(current_chunk))
-            current_chunk = current_chunk[-overlap:]
-            current_len = sum(len(s.split()) for s in current_chunk)
-        current_chunk.append(sentence)
-        current_len += token_count
-    if current_chunk:
-        chunks.append(" ".join(current_chunk))
     return chunks
-def extract_metadata_from_filename(filename):
-    name = filename.lower().replace("_", " ").replace("-", " ")
-    meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
-    if "om" in name: meta["doc_type"] = "owner manual"
-    elif "sm" in name: meta["doc_type"] = "service manual"
-    elif "assembly" in name: meta["doc_type"] = "assembly instructions"
-    elif "alert" in name: meta["doc_type"] = "installer alert"
-    elif "parts" in name: meta["doc_type"] = "parts manual"
-    known_models = ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage"]
-    for model in known_models:
-        if model.replace(" ", "") in name.replace(" ", ""):
-            meta["model"] = model
-    return meta
-def extract_text_with_ocr(page):
-    text = page.get_text().strip()
-    if text:
-        return text
-    pix = page.get_pixmap(dpi=300)
-    img_data = pix.tobytes("png")
-    img = Image.open(tempfile.SpooledTemporaryFile())
-    img.fp.write(img_data)
-    img.fp.seek(0)
-    return pytesseract.image_to_string(img).strip()
-# ---------------------------
-# 🧠 EMBEDDING + CHROMA
-# ---------------------------
-def embed_pdfs_from_uploaded(files, progress=gr.Progress(track_tqdm=True)):
-    os.makedirs(pdf_folder, exist_ok=True)
-    temp_chunks = []
-    for file in files:
         filename = os.path.basename(file.name)
-        dst = os.path.join(pdf_folder, filename)
-        shutil.copy(file.name, dst)
-        doc = fitz.open(dst)
-        meta = extract_metadata_from_filename(filename)
-        for page_num, page in enumerate(doc, start=1):
-            text = extract_text_with_ocr(page)
-            sents = tokenize_sentences(clean_text(text))
-            chunks = split_into_chunks(sents, chunk_size, chunk_overlap)
-            for i, chunk in enumerate(chunks):
-                temp_chunks.append({
-                    "chunk_id": f"{filename}::page_{page_num}::chunk_{i+1}",
-                    "source_file": filename,
-                    "page": page_num,
-                    "text": chunk,
-                    **meta
-                })
-    with open(output_jsonl_chunks, "w", encoding="utf-8") as f:
-        for c in temp_chunks:
-            json.dump(c, f)
-            f.write("\n")
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    client = chromadb.PersistentClient(path=chroma_path)
-    if collection_name in [c.name for c in client.list_collections()]:
-        client.delete_collection(collection_name)
-    collection = client.create_collection(collection_name)
-    for i in tqdm(range(0, len(temp_chunks), 16)):
-        batch = temp_chunks[i:i+16]
-        texts = [b["text"] for b in batch]
-        metadatas = [b for b in batch]
-        ids = [b["chunk_id"] for b in batch]
-        embeddings = embedder.encode(texts).tolist()
-        collection.add(documents=texts, ids=ids, metadatas=metadatas, embeddings=embeddings)
-    return collection, embedder
-# ---------------------------
-# 🤖 LLM INFERENCE
-# ---------------------------
-def load_llm(model_key):
-    model_id = MODEL_MAP.get(model_key)
-    if not model_id or not HF_TOKEN:
-        return None, None, None
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, device_map="auto")
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
-    return tokenizer, model, pipe
-def generate_answer(pipe, tokenizer, context, query):
-    messages = [
-        {"role": "system", "content": "You are an expert manual assistant. Answer accurately using only the context."},
-        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
-    ]
-    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    output = pipe(prompt)[0]["generated_text"]
-    return output.split("\n")[-1].strip()
-# ---------------------------
-# 🎯 FULL PIPELINE
-# ---------------------------
-def rag_pipeline(query, model_key, files):
-    collection, embedder = embed_pdfs_from_uploaded(files)
-    query_embedding = embedder.encode(query, convert_to_tensor=True)
-    results = collection.query(query_texts=[query], n_results=MAX_CONTEXT_CHUNKS)
-    if not results["documents"]:
-        return "No matches found."
-    context = "\n\n".join(results["documents"][0])
-    tokenizer, model, pipe = load_llm(model_key)
-    if pipe:
-        return generate_answer(pipe, tokenizer, context, query)
-    return "Model could not be loaded."
-# ---------------------------
-# 🖥️ GRADIO UI
-# ---------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("""# 🧠 SmartManuals-AI with Multi-Model RAG
-Upload your PDF manuals and ask smart questions. Choose your preferred LLM.""")
-    with gr.Row():
-        file_upload = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Manuals")
-    with gr.Row():
-        query_box = gr.Textbox(label="Question")
-        model_selector = gr.Dropdown(label="Choose Model", choices=list(MODEL_MAP.keys()), value="LLaMA 3 (8B)")
-    submit_btn = gr.Button("Run Query")
-    answer_box = gr.Textbox(label="Answer", lines=8)
-    submit_btn.click(fn=rag_pipeline, inputs=[query_box, model_selector, file_upload], outputs=[answer_box])
 demo.launch()

+# ✅ Hugging Face-ready `app.py` for SmartManuals-AI
+# Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback
 import os
 import fitz  # PyMuPDF
 import nltk
+import json
+import io
+import docx2txt
 import pytesseract
+import chromadb
 import gradio as gr
+import torch
 from tqdm import tqdm
+from PIL import Image
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer, util
+from nltk.tokenize import sent_tokenize
+nltk.download("punkt")
+# ----------------------------
+# Configuration
+# ----------------------------
+CHROMA_PATH = "./chroma_store"
+COLLECTION_NAME = "manual_chunks"
+CHUNK_SIZE = 750
+CHUNK_OVERLAP = 100
+MAX_CONTEXT = 3
+HF_MODELS = [
+    "meta-llama/Llama-3-8B-Instruct",
+    "meta-llama/Llama-3.1-8B-Instruct",
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "google/gemma-1.1-7b-it",
+    "Qwen/Qwen3-30B-A3B",
+]
 HF_TOKEN = os.environ.get("HF_TOKEN")
+# ----------------------------
+# Utilities
+# ----------------------------
 def clean_text(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
+def split_sentences(text):
     return sent_tokenize(text)
+def chunk_sentences(sentences):
+    chunks, chunk, length = [], [], 0
+    for sent in sentences:
+        tokens = len(sent.split())
+        if length + tokens > CHUNK_SIZE:
+            chunks.append(" ".join(chunk))
+            chunk = chunk[-CHUNK_OVERLAP:]
+            length = sum(len(s.split()) for s in chunk)
+        chunk.append(sent)
+        length += tokens
+    if chunk:
+        chunks.append(" ".join(chunk))
     return chunks
+def extract_text_pdf(file):
+    doc = fitz.open(stream=file.read(), filetype="pdf")
+    texts = []
+    for page in doc:
+        text = page.get_text()
+        if not text.strip():
+            pix = page.get_pixmap(dpi=300)
+            img = Image.open(io.BytesIO(pix.tobytes("png")))
+            text = pytesseract.image_to_string(img)
+        texts.append(text)
+    return texts
+def extract_text_docx(file):
+    return [docx2txt.process(file)]
+def extract_metadata(filename):
+    lower = filename.lower()
+    model = next((m for m in [
+        "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
+        "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
+    ] if m in lower.replace(" ", "")), "unknown")
+    doc_type = "unknown"
+    if "om" in lower or "owner" in lower:
+        doc_type = "owner manual"
+    elif "sm" in lower or "service" in lower:
+        doc_type = "service manual"
+    elif "assembly" in lower:
+        doc_type = "assembly instructions"
+    elif "parts" in lower:
+        doc_type = "parts manual"
+    elif "bulletin" in lower:
+        doc_type = "service bulletin"
+    return model, doc_type
+# ----------------------------
+# Embedding pipeline
+# ----------------------------
+def embed_docs(files, progress=gr.Progress()):
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    client = chromadb.PersistentClient(path=CHROMA_PATH)
+    try:
+        client.delete_collection(COLLECTION_NAME)
+    except: pass
+    collection = client.create_collection(COLLECTION_NAME)
+    texts, ids, metadatas = [], [], []
+    i = 0
+    for file in progress.tqdm(files, desc="Embedding files"):
         filename = os.path.basename(file.name)
+        ext = filename.lower().split(".")[-1]
+        raw_texts = extract_text_pdf(file) if ext == "pdf" else extract_text_docx(file)
+        model, doc_type = extract_metadata(filename)
+        for page, text in enumerate(raw_texts):
+            sents = split_sentences(clean_text(text))
+            for j, chunk in enumerate(chunk_sentences(sents)):
+                texts.append(chunk)
+                ids.append(f"{filename}::p{page+1}::c{j+1}")
+                metadatas.append({"source_file": filename, "page": page+1, "model": model, "doc_type": doc_type})
+                i += 1
+                if len(texts) >= 16:
+                    collection.add(documents=texts, metadatas=metadatas, ids=ids,
+                                   embeddings=embedder.encode(texts).tolist())
+                    texts, metadatas, ids = [], [], []
+    if texts:
+        collection.add(documents=texts, metadatas=metadatas, ids=ids,
+                       embeddings=embedder.encode(texts).tolist())
+    return f"✅ Embedded {i} chunks from {len(files)} files."
+# ----------------------------
+# Querying pipeline
+# ----------------------------
+def query_rag(q, model_name):
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    client = chromadb.PersistentClient(path=CHROMA_PATH)
+    collection = client.get_collection(COLLECTION_NAME)
+    chunks = collection.query(query_texts=[q], n_results=MAX_CONTEXT)
+    context = "\n\n".join(chunks['documents'][0])
+    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are a helpful assistant. Only answer from the provided manual context below.
+If unsure, say 'I don't know'.
+<context>
+{context}
+</context>
+<|start_header_id|>user<|end_header_id|>
+{q}<|start_header_id|>assistant<|end_header_id|>"""
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
+    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
+    result = pipe(prompt, max_new_tokens=300)[0]["generated_text"]
+    return result.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+# ----------------------------
+# Gradio Interface
+# ----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
+Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc).""")
+    with gr.Tab("📥 Upload & Embed"):
+        uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
+        embed_btn = gr.Button("🚀 Embed Files")
+        embed_output = gr.Textbox(label="Embed Log")
+    with gr.Tab("❓ Ask a Question"):
+        question = gr.Textbox(label="Your Question")
+        model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
+        ask_btn = gr.Button("💬 Ask")
+        response = gr.Textbox(label="Answer", lines=8)
+    embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
+    ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)
 demo.launch()