samithcs committed on
Commit
6af3122
·
verified ·
1 Parent(s): a216632

added gradio and app file

Browse files
Files changed (2) hide show
  1. app.py +4 -0
  2. gradio_app.py +108 -0
app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Entry point script: reuse the Gradio interface object built in gradio_app.
from gradio_app import iface

if __name__ == "__main__":
    # Serve the interface only when run as a script, not when imported.
    iface.launch()
gradio_app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pathlib import Path
3
+ import os
4
+ import re
5
+
6
+ from pipeline.ingest.pdf_parser import PDFParser
7
+ from pipeline.ingest.docx_parser import DOCXParser
8
+ from pipeline.ingest.txt_parser import TXTParser
9
+ from pipeline.ingest.html_parser import HTMLParser
10
+ from pipeline.chunking.fixed_chunker import FixedChunker
11
+ from pipeline.embeddings.sentence_transformer_embed import embed_chunks
12
+ from pipeline.vector_store.faiss_store import FaissStore
13
+ from pipeline.rag.retrieval_engine import answer_question
14
+
15
# On-disk location of the FAISS index; process_and_qa passes it both to the
# FaissStore it updates and to answer_question via store_kwargs.
FAISS_INDEX_PATH = "data/faiss.index"
# Vector dimension handed to FaissStore. Presumably this must equal the output
# size of the embedding model used below ("all-MiniLM-L6-v2") - confirm before
# switching models.
EMBED_DIM = 384
17
+
18
def sanitize_filename(filename):
    """Return *filename* with every character outside ``[a-zA-Z0-9_.-]`` replaced by ``_``.

    Path separators are not treated specially: they are simply replaced like
    any other disallowed character.
    """
    unsafe_chars = re.compile(r'[^a-zA-Z0-9_.-]')
    return unsafe_chars.sub('_', filename)
20
+
21
def process_and_qa(file, question):
    """Index an uploaded document and answer *question* against the index.

    The upload is parsed, chunked, embedded and appended to the FAISS index at
    ``FAISS_INDEX_PATH``; ``answer_question`` is then run against that index.

    Args:
        file: The upload as delivered by the UI. Accepted forms are an object
            with ``read()``, an object with a ``data`` attribute, raw
            ``bytes``, or a ``str`` path to an existing file.
        question: The question text to answer.

    Returns:
        A 3-tuple of strings ``(preview, answer, context)``. On failure the
        first element carries the error message and the other two are either
        ``"Error"`` or empty strings.
    """
    try:
        save_dir = Path("data/raw/")
        save_dir.mkdir(parents=True, exist_ok=True)

        # Resolve the upload into in-memory content or an existing path.
        content = None
        source_path = None
        if hasattr(file, "read"):
            content = file.read()
        elif hasattr(file, "data"):
            content = file.data
        elif isinstance(file, bytes):
            content = file
        elif isinstance(file, str) and os.path.exists(file):
            source_path = file
        else:
            return "Invalid file object format!", "Error", "Error"

        if content is None and source_path is None:
            # e.g. an object whose ``data`` attribute is None: nothing to parse.
            return "Invalid file object format!", "Error", "Error"

        # A plain string path has no ``name`` attribute, so fall back to the
        # path itself; otherwise the extension is lost and every such upload
        # is rejected as unsupported.
        raw_name = getattr(file, "name", None)
        if not raw_name:
            raw_name = source_path if source_path is not None else "uploaded_file"
        # Take the basename first so temp-directory components do not end up
        # mangled into the saved file name.
        filename = sanitize_filename(Path(str(raw_name)).name)

        parser_types = {
            ".pdf": PDFParser,
            ".docx": DOCXParser,
            ".txt": TXTParser,
            ".html": HTMLParser,
            ".htm": HTMLParser,
        }
        parser_cls = parser_types.get(Path(filename).suffix.lower())
        if parser_cls is None:
            # Checked before writing so unsupported uploads are not persisted.
            return "Unsupported filetype.", "", ""
        parser = parser_cls()

        if source_path is not None:
            # Already on disk: parse in place instead of copying.
            file_path = source_path
        else:
            file_path = save_dir / filename
            # ``is not None`` was checked above, so an empty upload is still
            # written; skipping it could make the parser read a stale file of
            # the same name left by an earlier upload.
            with open(file_path, "wb") as f:
                f.write(content)

        try:
            text, _metadata = parser.extract_text_and_metadata(str(file_path))
            chunks = FixedChunker().chunk(text, chunk_size=512, overlap=64)
            embeddings = embed_chunks(chunks, model_name="all-MiniLM-L6-v2")
            metadatas = [{} for _ in chunks]
            store = FaissStore(dim=EMBED_DIM, index_path=FAISS_INDEX_PATH)
            # Append to the existing index when there is one.
            if os.path.exists(FAISS_INDEX_PATH):
                store.load()
            store.add_documents(chunks, embeddings, metadatas)
            store.save()
        except Exception as e:
            return f"Failed to extract: {repr(e)}", "", ""

        qa_result = answer_question(
            question=question,
            embed_model="all-MiniLM-L6-v2",
            store_type="faiss",
            store_kwargs={"dim": EMBED_DIM, "index_path": FAISS_INDEX_PATH},
            llm_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            top_k=5,
        )
        answer = qa_result["answer"]
        matched_chunks = qa_result.get("chunks", [])
        if matched_chunks:
            context = "\n\n---\n\n".join(c["text"] for c in matched_chunks)
        else:
            context = "No supporting context found."
        return f"Preview (first 500 chars):\n{text[:500]}", answer, context

    except Exception as e:
        # UI boundary: report the failure in the output boxes instead of raising.
        return f"Error: {e}", "Error", "Error"
91
+
92
# Gradio UI: one file upload plus a question box in, three text panes out
# (document preview, model answer, retrieved context), all fed by process_and_qa.
iface = gr.Interface(
    title="Book/Document QA",
    description="Upload your document, ask a question, and see the answer with cited context!",
    fn=process_and_qa,
    inputs=[
        gr.File(label="Upload PDF, DOCX, TXT, or HTML"),
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Extracted/Text Preview", lines=10, show_copy_button=True),
        gr.Textbox(label="Answer", lines=6, show_copy_button=True),
        gr.Textbox(label="Matched Context", lines=12, show_copy_button=True),
    ],
)

if __name__ == "__main__":
    # Run directly: listen on all interfaces at port 7860.
    iface.launch(server_port=7860, server_name="0.0.0.0")