Update app.py
app.py
CHANGED
@@ -1,179 +1,176 @@
-# Hugging Face …
-# …

import os
import fitz  # PyMuPDF
import nltk
-import json
-import io
-import docx2txt
-import pytesseract
import chromadb
-import gradio as gr
-import torch
from tqdm import tqdm
-from PIL import Image
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
-from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize

-#
-#
-# ----------------------------
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
-MAX_CONTEXT = 3
-HF_MODELS = [
-    "meta-llama/Llama-3-8B-Instruct",
-    "meta-llama/Llama-3.1-8B-Instruct",
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "mistralai/Mistral-7B-Instruct-v0.3",
-    "google/gemma-1.1-7b-it",
-    "Qwen/Qwen3-30B-A3B",
-]
-HF_TOKEN = os.environ.get("HF_TOKEN")
-
-# ----------------------------
-# Utilities
-# ----------------------------
-def clean_text(text):
-    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

…
    for sent in sentences:
-        if …
-            chunks.append(" ".join(…
-        if …
-            chunks.append(" ".join(…
    return chunks

…
-            pix = page.get_pixmap(dpi=300)
-            img = Image.open(io.BytesIO(pix.tobytes("png")))
-            text = pytesseract.image_to_string(img)
-            texts.append(text)
-    return texts
-
-def extract_text_docx(file):
-    return [docx2txt.process(file)]
-
-def extract_metadata(filename):
-    lower = filename.lower()
-    model = next((m for m in [
-        "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
-        "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
-    ] if m in lower.replace(" ", "")), "unknown")
-
-    doc_type = "unknown"
-    if "om" in lower or "owner" in lower:
-        doc_type = "owner manual"
-    elif "sm" in lower or "service" in lower:
-        doc_type = "service manual"
-    elif "assembly" in lower:
-        doc_type = "assembly instructions"
-    elif "parts" in lower:
-        doc_type = "parts manual"
-    elif "bulletin" in lower:
-        doc_type = "service bulletin"
-
-    return model, doc_type
-
-# ----------------------------
-# Embedding pipeline
-# ----------------------------
-def embed_docs(files, progress=gr.Progress()):
-    embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    client = chromadb.PersistentClient(path=CHROMA_PATH)
-    try:
        client.delete_collection(COLLECTION_NAME)
-    except: pass
    collection = client.create_collection(COLLECTION_NAME)

…
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, …
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device…
…
+# app.py - Hugging Face Space Version (Finalized)
+# RAG over local PDFs/DOCX using Hugging Face-hosted models with Chroma

import os
+import json
import fitz  # PyMuPDF
import nltk
import chromadb
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
+from sentence_transformers import SentenceTransformer, util
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import pytesseract
+from PIL import Image
+import io
+import docx2txt
+import gradio as gr

+# ---------------------------
+# Configuration
+# ---------------------------
+MANUALS_DIR = "./Manuals"  # Folder containing all PDF and DOCX files
CHROMA_PATH = "./chroma_store"
+CHUNKS_PATH = "chunks.jsonl"
COLLECTION_NAME = "manual_chunks"
+MAX_CONTEXT_CHUNKS = 3
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100

+HF_TOKEN = os.environ.get("HF_TOKEN")
+LLM_MODELS = {
+    "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
+    "LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
+    "LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
+    "Gemma": "google/gemma-1.1-7b-it",
+    "Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
+}
+
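+# NOTE: the meta-llama and google/gemma repos above are gated on the Hub, so
+# HF_TOKEN must belong to an account that has accepted each model's license.
+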
+# ---------------------------
+# Setup
+# ---------------------------
+nltk.download('punkt')
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+client = chromadb.PersistentClient(path=CHROMA_PATH)
+collection = None
+
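+# The punkt tokenizer is fetched on first launch; the embedder and Chroma
+# client are module-level, so embedding and querying below reuse one instance.
+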
+# ---------------------------
+# Load all PDFs and DOCX content
+# ---------------------------
+def extract_all_documents():
+    chunks = []
+    for fname in os.listdir(MANUALS_DIR):
+        path = os.path.join(MANUALS_DIR, fname)
+        if fname.lower().endswith(".pdf"):
+            doc = fitz.open(path)
+            for i, page in enumerate(doc):
+                text = page.get_text().strip()
+                if not text:
+                    pix = page.get_pixmap(dpi=300)
+                    img = Image.open(io.BytesIO(pix.tobytes("png")))
+                    text = pytesseract.image_to_string(img)
+                if text.strip():
+                    chunks.append((fname, i + 1, text.strip()))
+        elif fname.lower().endswith(".docx"):
+            text = docx2txt.process(path)
+            if text.strip():
+                chunks.append((fname, 1, text.strip()))
+    return chunks

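+# Pages with no extractable text layer fall back to OCR: the page is rendered
+# to a 300-dpi bitmap and run through pytesseract, so scanned manuals are
+# still indexed alongside born-digital ones.
+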
+# ---------------------------
+# Chunk text
+# ---------------------------
+def split_chunks(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+    sentences = sent_tokenize(text)
+    chunks, curr, curr_len = [], [], 0
    for sent in sentences:
+        tok_len = len(sent.split())
+        if curr_len + tok_len > size:
+            chunks.append(" ".join(curr))
+            curr = curr[-overlap:]
+            curr_len = sum(len(s.split()) for s in curr)
+        curr.append(sent)
+        curr_len += tok_len
+    if curr:
+        chunks.append(" ".join(curr))
    return chunks

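+# NOTE: size is counted in whitespace-delimited words, but curr[-overlap:]
+# keeps the last `overlap` sentences. A 750-word chunk usually holds fewer
+# than 100 sentences, so the slice tends to keep everything and consecutive
+# chunks overlap heavily; slicing by words would match CHUNK_OVERLAP's intent.
+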
+# ---------------------------
+# Embed into Chroma
+# ---------------------------
+def embed_documents():
+    global collection
+    if collection:
        client.delete_collection(COLLECTION_NAME)
    collection = client.create_collection(COLLECTION_NAME)

+    docs = extract_all_documents()
+    records = []
+    for fname, page, text in docs:
+        for i, chunk in enumerate(split_chunks(text)):
+            if not chunk.strip():
+                continue
+            records.append({
+                "id": f"{fname}::p{page}::c{i}",
+                "text": chunk,
+                "metadata": {"source_file": fname, "page": page}
+            })
+
+    for i in tqdm(range(0, len(records), 16)):
+        batch = records[i:i + 16]
+        texts = [b["text"] for b in batch]
+        ids = [b["id"] for b in batch]
+        metas = [b["metadata"] for b in batch]
+        embs = embedder.encode(texts).tolist()
+        collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
+    return f"Embedded {len(records)} chunks"
+
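+# Chunks are embedded in batches of 16 to bound memory use; each id encodes
+# file, page, and chunk index (e.g. "manual.pdf::p3::c2"), so every retrieved
+# chunk can be traced back to its source page.
+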
+# ---------------------------
+# Query
+# ---------------------------
+def search_context(query, top_k=MAX_CONTEXT_CHUNKS):
+    results = collection.query(query_texts=[query], n_results=top_k)
+    chunks = results["documents"][0]
+    metas = results["metadatas"][0]
+    return "\n\n".join(
+        f"File: {m['source_file']}, Page: {m['page']}\n{c}" for m, c in zip(metas, chunks)
+    )
+
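+# NOTE: stored vectors come from the module-level all-MiniLM-L6-v2 embedder,
+# while query_texts is encoded by Chroma's default embedding function (also
+# all-MiniLM-L6-v2 at the time of writing); passing
+# query_embeddings=embedder.encode([query]).tolist() would pin both sides to
+# exactly the same encoder.
+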
+# ---------------------------
+# Run Inference
+# ---------------------------
+def ask_model(model_name, query):
+    if not HF_TOKEN:
+        return "HF_TOKEN not set."
+    if collection is None:
+        return "Run 'Embed Documents' first."
+    context = search_context(query)
+    system_prompt = "Answer only using the context. Say 'I don't know' if not found."
+    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|start_header_id|>user<|end_header_id|>{context}\n\nQuestion: {query}<|start_header_id|>assistant<|end_header_id|>"

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, device_map="auto")
+    # device_map="auto" already places the model, so pipeline gets no device=
+    # argument (setting both raises a ValueError in transformers).
+    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+    output = pipe(prompt, max_new_tokens=512, do_sample=True)[0]["generated_text"]
+    return output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+
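+# The tokenizer and model are reloaded on every question; caching them per
+# model_name (e.g. in a module-level dict) would avoid repeated multi-GB
+# loads. Note also that the prompt uses LLaMA-3 header tokens, so the
+# Mistral/Gemma/Qwen options receive them as plain text rather than through
+# their own chat templates.
+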
+# ---------------------------
+# Gradio UI
+# ---------------------------
+def launch_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("""
+        # SmartManuals-AI (Hugging Face Edition)
+        Upload manuals to `./Manuals`, click Embed, then ask questions.
+        """)
+
+        with gr.Row():
+            embed_button = gr.Button("Embed Documents")
+            embed_status = gr.Textbox(label="Status")
+
+        with gr.Row():
+            model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B")
+            question = gr.Textbox(label="Question")
+        answer = gr.Textbox(label="Answer", lines=10)
+        submit = gr.Button("Ask")
+
+        embed_button.click(fn=embed_documents, outputs=embed_status)
+        submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer])
+
+    demo.launch()
+
+# ---------------------------
+if __name__ == "__main__":
+    launch_interface()