Spaces:
Sleeping
Sleeping
| # ✅ Hugging Face-ready `app.py` for SmartManuals-AI | |
| # Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback | |
| import os | |
| import fitz # PyMuPDF | |
| import nltk | |
| import json | |
| import io | |
| import docx2txt | |
| import pytesseract | |
| import chromadb | |
| import gradio as gr | |
| import torch | |
| from tqdm import tqdm | |
| from PIL import Image | |
| from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM | |
| from sentence_transformers import SentenceTransformer, util | |
| from nltk.tokenize import sent_tokenize | |
# Fetch the Punkt sentence-tokenizer data that sent_tokenize relies on.
# Newer NLTK releases look up the "punkt_tab" resource instead of the older
# pickled "punkt" one, so request both; quiet=True keeps the download
# chatter out of the application logs.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource, quiet=True)
| # ---------------------------- | |
| # Configuration | |
| # ---------------------------- | |
# Directory where the persistent Chroma vector store is kept.
CHROMA_PATH = "./chroma_store"
# Name of the single Chroma collection holding all manual chunks.
COLLECTION_NAME = "manual_chunks"
# Upper bound on chunk length, counted in whitespace-separated words.
CHUNK_SIZE = 750
# Overlap carried from one chunk into the next when chunking.
CHUNK_OVERLAP = 100
# Number of retrieved chunks placed in the prompt for each question.
MAX_CONTEXT = 3
# Models offered in the UI dropdown; the first entry is the default choice.
HF_MODELS = [
    # The Llama 3 8B repository is published under the "Meta-Llama-3" name;
    # "meta-llama/Llama-3-8B-Instruct" does not resolve on the Hub.
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
    "Qwen/Qwen3-30B-A3B",
]
# Access token for gated models, read from the environment; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
| # ---------------------------- | |
| # Utilities | |
| # ---------------------------- | |
def clean_text(text):
    """Strip every line of *text* and drop the ones that end up empty.

    The surviving lines are rejoined with single newline characters.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return "\n".join(filter(None, stripped_lines))
def split_sentences(text):
    """Return the sentences that NLTK's ``sent_tokenize`` finds in *text*."""
    sentences = sent_tokenize(text)
    return sentences
def chunk_sentences(sentences, chunk_size=None, chunk_overlap=None):
    """Group sentences into space-joined chunks bounded by a word budget.

    Sentences are accumulated until adding the next one would push the
    chunk past ``chunk_size`` words; the chunk is then emitted and the next
    chunk starts with the trailing sentences of the previous one, up to
    ``chunk_overlap`` words, so neighbouring chunks share some context.

    Args:
        sentences: Iterable of sentence strings, in document order.
        chunk_size: Maximum words per chunk. Defaults to ``CHUNK_SIZE``.
        chunk_overlap: Maximum words carried over from the previous chunk.
            Defaults to ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk strings. A chunk exceeds ``chunk_size`` only when a
        single sentence is itself longer than the budget. Empty input
        yields an empty list.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    chunks = []
    current = []  # sentences of the chunk being built
    length = 0    # total word count of `current`
    for sent in sentences:
        words = len(sent.split())
        # `current` must be non-empty before flushing, otherwise an
        # oversized first sentence would emit an empty chunk.
        if current and length + words > size:
            chunks.append(" ".join(current))

            # Carry over whole trailing sentences while they fit in the
            # overlap budget. The budget is counted in words (like
            # chunk_size), not in sentences: slicing the last N sentences
            # would normally keep the entire chunk and make every following
            # chunk grow without bound.
            carried, carried_len = [], 0
            for previous in reversed(current):
                previous_len = len(previous.split())
                if carried_len + previous_len > overlap:
                    break
                carried.append(previous)
                carried_len += previous_len
            # Drop the overlap when it would not fit next to the incoming
            # sentence; this also guarantees every chunk adds new content.
            if carried_len + words > size:
                carried, carried_len = [], 0
            carried.reverse()
            current, length = carried, carried_len

        current.append(sent)
        length += words

    if current:
        chunks.append(" ".join(current))
    return chunks
def extract_text_pdf(file):
    """Return the text of every page in a PDF, with OCR as a fallback.

    Args:
        file: Either a file-like object exposing ``read()`` (its bytes are
            handed to PyMuPDF as a stream) or a filesystem path
            (``str`` / ``os.PathLike``), which is what path-style Gradio
            uploads provide.

    Returns:
        A list with one string per page, in page order. Pages whose text
        layer is empty or whitespace-only are rendered at 300 dpi and run
        through Tesseract instead.
    """
    if hasattr(file, "read"):
        doc = fitz.open(stream=file.read(), filetype="pdf")
    else:
        doc = fitz.open(os.fspath(file))

    texts = []
    try:
        for page in doc:
            text = page.get_text()
            if not text.strip():
                # No extractable text layer (e.g. a scanned page): rasterise
                # the page and OCR the resulting PNG.
                pix = page.get_pixmap(dpi=300)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text = pytesseract.image_to_string(img)
            texts.append(text)
    finally:
        # Release the document even when extraction or OCR fails.
        doc.close()
    return texts
def extract_text_docx(file):
    """Extract a Word document's text with ``docx2txt``.

    The result is wrapped in a one-element list so callers can iterate over
    it the same way they iterate over the per-page list returned for PDFs.
    """
    document_text = docx2txt.process(file)
    return [document_text]
def extract_metadata(filename):
    """Guess the equipment model and document type from a manual's filename.

    Args:
        filename: Name of the uploaded file (case-insensitive).

    Returns:
        A ``(model, doc_type)`` tuple of strings. ``model`` is the first
        entry of the known-model list found in the filename, or
        ``"unknown"``. ``doc_type`` is one of ``"owner manual"``,
        ``"service manual"``, ``"assembly instructions"``,
        ``"parts manual"``, ``"service bulletin"`` or ``"unknown"``.
    """
    lower = filename.lower()

    def squash(value):
        # Keep only letters and digits so "Integrity SL", "integrity_sl"
        # and "integrity-sl" all compare equal.
        return "".join(ch for ch in value if ch.isalnum())

    # Order matters: longer names come before their prefixes ("se3hd"
    # before "se3") so the most specific model wins.
    known_models = (
        "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
        "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c",
    )
    collapsed = squash(lower)
    # Both sides are squashed; comparing a space-containing name against a
    # space-stripped filename could never match.
    model = next((m for m in known_models if squash(m) in collapsed), "unknown")

    # The two-letter abbreviations are matched as whole tokens only;
    # as plain substrings "om"/"sm" also hit words such as "commercial"
    # or "transmission".
    tokens = set("".join(ch if ch.isalnum() else " " for ch in lower).split())

    doc_type = "unknown"
    # "bulletin" is tested first so a "service bulletin" is not swallowed
    # by the broader "service" keyword.
    if "bulletin" in lower:
        doc_type = "service bulletin"
    elif "om" in tokens or "owner" in lower:
        doc_type = "owner manual"
    elif "sm" in tokens or "service" in lower:
        doc_type = "service manual"
    elif "assembly" in lower:
        doc_type = "assembly instructions"
    elif "parts" in lower:
        doc_type = "parts manual"
    return model, doc_type
| # ---------------------------- | |
| # Embedding pipeline | |
| # ---------------------------- | |
def embed_docs(files, progress=gr.Progress()):
    """Rebuild the Chroma collection from the uploaded manuals.

    Any existing collection is deleted and recreated. Each file is split
    into per-page text, cleaned, sentence-tokenised and chunked; chunks are
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer and written
    to Chroma in batches of 16 together with source/page/model/doc-type
    metadata.

    Args:
        files: Uploaded files from the Gradio ``File`` component. Items may
            be objects exposing ``.name`` or plain path strings. ``None``
            or an empty list is reported instead of raising.
        progress: Gradio progress tracker used to show per-file progress.

    Returns:
        A human-readable status string for the "Embed Log" textbox.
    """
    if not files:
        return "No files selected. Upload at least one PDF or DOCX file first."

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        # Nothing to delete on the first run. The exception class raised
        # for a missing collection differs between chromadb releases, so it
        # cannot be narrowed further; unlike a bare `except`, this still
        # lets KeyboardInterrupt/SystemExit propagate.
        pass
    collection = client.create_collection(COLLECTION_NAME)

    texts, ids, metadatas = [], [], []
    i = 0  # total number of chunks queued for embedding
    for file in progress.tqdm(files, desc="Embedding files"):
        # Uploads may be path strings rather than objects with `.name`.
        filename = os.path.basename(getattr(file, "name", file))
        ext = filename.lower().split(".")[-1]
        raw_texts = extract_text_pdf(file) if ext == "pdf" else extract_text_docx(file)
        model, doc_type = extract_metadata(filename)
        for page, text in enumerate(raw_texts):
            sents = split_sentences(clean_text(text))
            for j, chunk in enumerate(chunk_sentences(sents)):
                if not chunk.strip():
                    # An empty chunk carries no information to retrieve.
                    continue
                texts.append(chunk)
                ids.append(f"{filename}::p{page+1}::c{j+1}")
                metadatas.append({"source_file": filename, "page": page+1, "model": model, "doc_type": doc_type})
                i += 1
                if len(texts) >= 16:
                    # Flush a full batch so memory use stays bounded.
                    collection.add(documents=texts, metadatas=metadatas, ids=ids,
                                   embeddings=embedder.encode(texts).tolist())
                    texts, metadatas, ids = [], [], []
    if texts:
        # Flush whatever is left over after the last full batch.
        collection.add(documents=texts, metadatas=metadatas, ids=ids,
                       embeddings=embedder.encode(texts).tolist())
    return f"✅ Embedded {i} chunks from {len(files)} files."
| # ---------------------------- | |
| # Querying pipeline | |
| # ---------------------------- | |
def query_rag(q, model_name):
    """Answer a question from the embedded manuals with the chosen HF model.

    The question is embedded with the same ``all-MiniLM-L6-v2`` model used
    at indexing time, the ``MAX_CONTEXT`` closest chunks are retrieved from
    Chroma, and the selected causal LM is asked to answer using only that
    context.

    Args:
        q: The user's question.
        model_name: Hugging Face model id to load for generation.

    Returns:
        The model's answer, or a short explanatory message when the
        question is empty or no collection has been embedded yet.
    """
    if not q or not q.strip():
        return "Please enter a question."

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        collection = client.get_collection(COLLECTION_NAME)
    except Exception as err:
        # UI boundary: report a missing/unreadable collection instead of
        # surfacing a stack trace. The exact exception class depends on the
        # installed chromadb release.
        return f"No embedded manuals found ({err}). Upload and embed documents first."

    # Query with explicit embeddings so retrieval uses the same vector
    # space the documents were stored in, rather than whatever embedding
    # function Chroma would apply to raw query text.
    chunks = collection.query(
        query_embeddings=embedder.encode([q]).tolist(),
        n_results=MAX_CONTEXT,
    )
    context = "\n\n".join(chunks["documents"][0])

    # A single user turn is used (no separate system role) and the pipeline
    # is left to apply each model's own chat template: a hand-written
    # Llama-3 header layout is wrong for the Mistral, Gemma and Qwen
    # entries offered in the dropdown.
    messages = [
        {
            "role": "user",
            "content": (
                "You are a helpful assistant. Only answer from the provided manual context below.\n"
                "If unsure, say 'I don't know'.\n"
                f"<context>\n{context}\n</context>\n\n"
                f"Question: {q}"
            ),
        }
    ]

    # NOTE(review): the model is loaded from scratch on every call, on CPU
    # in float32, exactly as before; caching it would trade latency for
    # resident memory and is left as a deployment decision.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

    # With chat-style input the pipeline returns the whole conversation;
    # the final message is the newly generated assistant reply.
    conversation = pipe(messages, max_new_tokens=300)[0]["generated_text"]
    return conversation[-1]["content"].strip()
| # ---------------------------- | |
| # Gradio Interface | |
| # ---------------------------- | |
# Two-tab UI: one tab to upload and index manuals, one to ask questions.
# Emoji in the labels are restored from mojibake left by a bad text
# encoding round-trip.
with gr.Blocks() as demo:
    gr.Markdown(
        """# 🧠 SmartManuals-AI (HF Edition)
Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc)."""
    )

    with gr.Tab("📥 Upload & Embed"):
        uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        embed_btn = gr.Button("🚀 Embed Files")
        embed_output = gr.Textbox(label="Embed Log")

    with gr.Tab("❓ Ask a Question"):
        question = gr.Textbox(label="Your Question")
        model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
        ask_btn = gr.Button("💬 Ask")
        response = gr.Textbox(label="Answer", lines=8)

    # Event wiring is done inside the Blocks context.
    embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
    ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)

# Start the server only when run as a script, so importing this module
# (e.g. from tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()