Spaces:
Sleeping
Sleeping
# SmartManuals-AI: Hugging Face Space App (RAM Safe, Multi-model, No Preview)
| import os, json, fitz, torch, chromadb, docx | |
| import gradio as gr | |
| from PIL import Image | |
| from nltk.tokenize import sent_tokenize | |
| from sentence_transformers import SentenceTransformer, util | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| from tqdm import tqdm | |
# ---------------------------
# Constants
# ---------------------------
# Where the source manuals live and where the Chroma store is persisted.
MANUALS_DIR = "Manuals"
CHROMA_PATH = "./chroma_store"
# NOTE(review): CHUNKS_JSONL is not referenced anywhere in the visible code.
CHUNKS_JSONL = "manual_chunks.jsonl"
COLLECTION_NAME = "manual_chunks"

# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Chunking and retrieval parameters.
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
TOP_K = 3

# Dropdown label -> Hugging Face model id.
# NOTE(review): the "Qwen3 7B" label points at a Qwen1.5 checkpoint - confirm
# which of the two is intended.
MODEL_OPTIONS = {
    "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma 7B": "google/gemma-7b-it",
    "Qwen3 7B": "Qwen/Qwen1.5-7B-Chat",
}
# ---------------------------
# Extract text from PDF and DOCX files
# ---------------------------
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at *path*.

    Each page's text is followed by a newline. Extraction is best-effort:
    if opening the file or reading a page fails, the error is printed and
    whatever text was collected before the failure is returned (an empty
    string when nothing could be read).

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page text as a single string.
    """
    pages = []
    try:
        # The context manager closes the document even when a page fails to
        # extract, so the file handle is never leaked on the error path.
        with fitz.open(path) as doc:
            for page in doc:
                pages.append(page.get_text() + "\n")
    except Exception as e:
        print(f"β PDF Error in {path}: {e}")
    return "".join(pages)
def extract_text_from_docx(path):
    """Return the non-blank paragraphs of the DOCX at *path*, one per line.

    Any failure while opening or reading the document is printed and an
    empty string is returned instead of raising.
    """
    try:
        paragraphs = docx.Document(path).paragraphs
        non_blank = [para.text for para in paragraphs if para.text.strip()]
        return "\n".join(non_blank)
    except Exception as err:
        print(f"β DOCX Error in {path}: {err}")
        return ""
# ---------------------------
# Clean + Chunk
# ---------------------------
def clean(text):
    """Strip every line of *text* and drop the lines that end up empty.

    Returns the surviving lines joined with a single newline.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return "\n".join(kept for kept in stripped_lines if kept)
def split_sentences(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(text)
    return sentences
def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Group *sentences* into overlapping chunks bounded by word count.

    Sentences are appended to the current chunk until adding the next one
    would push it past *size* words. The chunk is then emitted and the next
    chunk starts with the trailing sentences of the previous one, up to
    *overlap* words in total. Words are counted by whitespace splitting.

    The overlap is measured in words, the same unit as *size*. Slicing the
    last *overlap* sentences instead would usually keep the whole chunk
    (a 750-word chunk rarely has 100 sentences), so the running length would
    never shrink and every further sentence would emit another, ever larger,
    near-duplicate chunk.

    Args:
        sentences: Iterable of sentence strings, in document order.
        size: Maximum number of words per chunk. A single sentence longer
            than this still becomes part of a chunk rather than being split.
        overlap: Maximum number of words carried over from the end of one
            chunk to the start of the next. Zero or negative disables the
            carry-over. Expected to be smaller than *size*.

    Returns:
        A list of chunk strings, each made of sentences joined by a space.
    """
    chunks = []
    current = []  # (sentence, word_count) pairs of the chunk being built
    length = 0    # total word count of `current`

    for sent in sentences:
        n = len(sent.split())
        if current and length + n > size:
            chunks.append(" ".join(s for s, _ in current))

            # Walk backwards over whole sentences, keeping as many as fit in
            # the overlap budget; never split a sentence.
            carried, carried_len = [], 0
            for s, count in reversed(current):
                if carried_len + count > overlap:
                    break
                carried.append((s, count))
                carried_len += count
            carried.reverse()
            current, length = carried, carried_len

        current.append((sent, n))
        length += n

    if current:
        chunks.append(" ".join(s for s, _ in current))
    return chunks
# ---------------------------
# Embed and store in Chroma
# ---------------------------
def embed_all():
    """Rebuild the Chroma collection from the manuals in ``MANUALS_DIR``.

    Every ``.pdf`` / ``.docx`` file (extension matched case-insensitively)
    is extracted, cleaned, split into sentences and chunked. The chunks are
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer in batches
    of 16 and stored in a freshly created collection named
    ``COLLECTION_NAME``; an existing collection of that name is deleted
    first.

    A missing manuals directory is reported and treated as "no files", so
    the app can still start with an empty collection.

    Returns:
        A ``(collection, embedder)`` tuple: the populated Chroma collection
        and the SentenceTransformer instance used to embed the chunks.
    """
    print("π Scanning manuals and embedding...")
    os.makedirs(CHROMA_PATH, exist_ok=True)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)

    # Depending on the chromadb release, list_collections() yields either
    # Collection objects or plain names; accept both.
    existing = [getattr(c, "name", c) for c in client.list_collections()]
    if COLLECTION_NAME in existing:
        client.delete_collection(COLLECTION_NAME)
    collection = client.create_collection(COLLECTION_NAME)

    if os.path.isdir(MANUALS_DIR):
        files = [
            f for f in os.listdir(MANUALS_DIR)
            if f.lower().endswith((".pdf", ".docx"))
        ]
    else:
        print(f"Manuals directory not found: {MANUALS_DIR}")
        files = []

    all_chunks = []
    for fname in tqdm(files):
        path = os.path.join(MANUALS_DIR, fname)
        # Dispatch on the lower-cased name so "MANUAL.PDF", which passes the
        # case-insensitive filter above, is not sent to the DOCX reader.
        if fname.lower().endswith(".pdf"):
            text = extract_text_from_pdf(path)
        else:
            text = extract_text_from_docx(path)
        chunks = chunk_text(split_sentences(clean(text)))
        all_chunks.extend(
            {
                "id": f"{fname}::chunk_{i}",
                "text": chunk,
                "metadata": {"source": fname},
            }
            for i, chunk in enumerate(chunks, start=1)
        )

    # Batch embed and store
    for start in range(0, len(all_chunks), 16):
        batch = all_chunks[start:start + 16]
        docs = [c["text"] for c in batch]
        ids = [c["id"] for c in batch]
        metas = [c["metadata"] for c in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)

    print(f"β Embedded {len(all_chunks)} chunks.")
    return collection, embedder
# ---------------------------
# RAG search & LLM answer
# ---------------------------
def ask(query, model_key):
    """Answer *query* from the indexed manuals using the selected model.

    The ``TOP_K`` most similar chunks are fetched from the module-level
    ``db`` collection and placed in a prompt that tells the model to answer
    from that context only. Generation is greedy and capped at 300 new
    tokens.

    Args:
        query: The user's question.
        model_key: A key of ``MODEL_OPTIONS`` selecting the model to load.

    Returns:
        The generated answer, or a human-readable message when the query is
        empty, the model key is unknown, nothing was retrieved, or model
        loading / generation failed. Exceptions raised by the retrieval
        call itself are not caught here.
    """
    if not query or not query.strip():
        return "Please enter a question."

    model_id = MODEL_OPTIONS.get(model_key)
    if model_id is None:
        return f"Unknown model: {model_key}"

    # Retrieve first: it is cheap compared with loading the model, so an
    # empty index is reported without paying the model-loading cost.
    results = db.query(query_texts=[query], n_results=TOP_K)
    documents = results.get("documents") or [[]]
    chunks = [c for c in documents[0] if c]
    if not chunks:
        return "No relevant content found in the manuals."

    use_cuda = torch.cuda.is_available()
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
        )
        model.to("cuda" if use_cuda else "cpu")
        gen = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if use_cuda else -1,
        )
    except Exception as e:
        return f"β Model loading failed: {e}"

    context = "\n\n".join(chunks)
    prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
    try:
        res = gen(prompt, max_new_tokens=300, do_sample=False)[0]['generated_text']
    except Exception as e:
        return f"β LLM failed: {e}"

    # The pipeline output normally echoes the prompt; drop it by length so
    # an "Answer:" inside the retrieved context or the question cannot be
    # mistaken for the start of the reply.
    if res.startswith(prompt):
        return res[len(prompt):].strip()
    # Fallback when the prompt is not echoed verbatim.
    return res.split("Answer:", 1)[-1].strip()
# ---------------------------
# UI
# ---------------------------
# Build the vector index once at start-up; `ask` reads `db` as a module global.
db, embedder = embed_all()

# NOTE(review): the widget nesting below was reconstructed from a source whose
# indentation had been stripped - confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## π§ SmartManuals-AI β Ask Your PDF and Word Docs")
    with gr.Row():
        qbox = gr.Textbox(
            label="Ask a Question",
            placeholder="e.g. How do I calibrate SE3 console?",
        )
        model_pick = gr.Dropdown(
            choices=list(MODEL_OPTIONS),
            label="Choose a Model",
            value="Mistral 7B",
        )
    answer = gr.Textbox(label="Answer", lines=8)
    ask_btn = gr.Button("Ask")
    # Clicking the button sends the question and model choice to `ask` and
    # shows its return value in the answer box.
    ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer])

demo.launch()