Spaces:
Build error
Build error
| import os | |
| import torch | |
| import pandas as pd | |
| import chromadb | |
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer | |
| from chromadb.config import Settings | |
| from transformers import pipeline | |
# Device setup: everything in this app runs on CPU.
device = -1  # Use CPU
print("Device set to: CPU")

# Load CSV data. Rows with no "content" value are dropped and the index is
# renumbered from 0 so row positions can be used as document ids.
df = pd.read_csv("iec_college_data.csv").dropna(subset=["content"]).reset_index(drop=True)

# Load embedding model on CPU
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# ChromaDB setup: the client stores its data under ./chroma_db on disk.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "iec_data"

# Get or create the collection in a single call. This avoids reading `.name`
# from the items returned by list_collections(), whose element type is not the
# same across ChromaDB releases (Collection objects vs. plain name strings).
collection = chroma_client.get_or_create_collection(name=collection_name)
# Only index if collection is empty, so an already populated collection is
# reused as-is instead of being embedded again.
if collection.count() == 0:
    print("Indexing documents...")
    texts = df["content"].tolist()
    # With default settings encode() returns one embedding row per input text.
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=True)

    # Only "content" is filtered for missing values when the CSV is loaded;
    # empty title/url cells are stored as empty strings rather than NaN.
    titles = df["title"].fillna("").tolist()
    urls = df["url"].fillna("").tolist()

    # Add documents in batches: one collection.add() call per batch instead
    # of one call per row.
    add_batch_size = 50
    total = len(texts)
    for start in range(0, total, add_batch_size):
        stop = min(start + add_batch_size, total)
        collection.add(
            # Plain Python lists of floats instead of numpy rows.
            embeddings=embeddings[start:stop].tolist(),
            documents=texts[start:stop],
            metadatas=[
                {"title": title, "url": url}
                for title, url in zip(titles[start:stop], urls[start:stop])
            ],
            # Ids are the row positions, matching the reset DataFrame index.
            ids=[str(position) for position in range(start, stop)],
        )
        print(f"Indexed {stop}/{total}")
    print("Indexing complete.")
# Use lightweight extractive QA model. The module-level `device` setting is
# reused here so the CPU choice is made in one place only.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)
| # QA function | |
def answer_question(user_question):
    """Answer a question from the indexed documents.

    The question is embedded, the three closest documents are fetched from the
    vector collection, and the extractive QA pipeline selects an answer span
    from their combined text.

    Args:
        user_question: Free-text question entered by the user.

    Returns:
        The answer string produced by the QA pipeline, or a short explanatory
        message when the question is blank or no document text was retrieved.
    """
    # The QA pipeline raises on an empty question, so blank input (or None)
    # is answered directly without running any model.
    if user_question is None or not user_question.strip():
        return "Please enter a question."

    question_embedding = embed_model.encode(user_question)
    # Send a plain list of floats rather than a numpy array.
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=3,
    )

    # results["documents"] holds one list of documents per query embedding.
    retrieved = results["documents"][0] if results["documents"] else []
    context = "\n".join(retrieved)
    words = context.split()
    if not words:
        # An empty context would make the QA pipeline raise as well.
        return "Sorry, I could not find any relevant information."

    # Cap the context at 400 words (re-joined with single spaces).
    if len(words) > 400:
        context = " ".join(words[:400])

    result = qa_pipeline(question=user_question, context=context)
    return result["answer"]
# Gradio UI: one question textbox in, one answer textbox out.
question_input = gr.Textbox(lines=2, placeholder="Ask about IEC College...")
answer_output = gr.Textbox(label="Answer")

iface = gr.Interface(
    fn=answer_question,
    inputs=question_input,
    outputs=answer_output,
    title="IEC College Assistant",
    description="Ask questions about IEC College based on structured data.",
)

# Start the web app; share=True is passed through to launch() unchanged.
iface.launch(share=True)