AjayArmugam committed on
Commit
8253f3b
·
verified ·
1 Parent(s): 4b70d5f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +126 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from rapidfuzz import fuzz
3
+ import fitz
4
+ import easyocr
5
+ import numpy as np
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
+
10
+ # ===== GLOBALS =====
11
# OCR engine (English only), built once at module level and reused by
# process_pdf for pages that yield too little embedded text.
reader = easyocr.Reader(["en"])

# Vector store for the most recently processed PDF. Stays None until
# process_pdf assigns it; get_answer checks for None before searching.
db = None
13
+
14
+
15
+ # ===== PROCESS PDF =====
16
def process_pdf(file):
    """Extract text from an uploaded PDF and index it in the global vector store.

    At most the first 50 pages are read. A page whose embedded text is shorter
    than 50 non-whitespace-trimmed characters is rendered to a pixmap and run
    through the module-level EasyOCR ``reader`` instead. The combined text is
    split into overlapping chunks, embedded, and stored in the global ``db``.

    Args:
        file: The value delivered by the ``gr.File`` component. Either an
            object exposing the path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.

    Returns:
        A status message for the UI.
    """
    global db

    max_pages = 50
    min_text_chars = 50

    if file is None:
        return "⚠️ Upload a PDF first."

    # Accept both a file wrapper with ``.name`` and a bare path string.
    path = getattr(file, "name", file)

    page_texts = []
    # The context manager closes the document even if extraction/OCR raises.
    with fitz.open(path) as doc:
        for page_number in range(min(len(doc), max_pages)):
            page = doc.load_page(page_number)
            page_text = page.get_text()

            # Too little embedded text: treat the page as scanned and OCR it.
            if len(page_text.strip()) < min_text_chars:
                pix = page.get_pixmap()
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, pix.n
                )
                result = reader.readtext(img)
                # Each OCR result carries the recognized text at index 1.
                page_text = " ".join(r[1] for r in result)

            page_texts.append(page_text)

    text = "".join(f"{page_text}\n" for page_text in page_texts)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
    )
    chunks = splitter.split_text(text)

    # Building a FAISS index from zero texts fails, so report it instead.
    # The previously built index (if any) is left untouched.
    if not chunks:
        return "⚠️ No readable text found in this PDF."

    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_texts(chunks, embeddings)

    return "✅ PDF processed! Ask your question now."
45
+
46
+
47
+ # ===== ANSWER FUNCTION =====
48
def get_answer(query):
    """Return the sentence from the indexed PDF that best matches ``query``.

    The three most similar chunks are fetched from the global ``db``. Each
    chunk is split on ``"."`` and every sentence of at least 20 characters is
    scored with ``fuzz.partial_ratio`` against the query (case-insensitive).
    Sentences containing the whole word "is" or "mode" get a +10 bonus.

    Args:
        query: The user's question.

    Returns:
        A ``(answer, source)`` tuple of strings. ``source`` is the first 200
        characters of the chunk the answer came from, or ``""`` when the
        answer is a warning message rather than document text.
    """
    # Function-scope import keeps this block self-contained.
    import re

    if not query or not query.strip():
        return "⚠️ Please enter a question.", ""

    if db is None:
        return "⚠️ Upload and process a PDF first.", ""

    docs = db.similarity_search(query, k=3)

    query_lower = query.lower()
    # Whole-word match: a plain substring test for "is" also fires on
    # "this", "list", ... which would boost almost every sentence.
    boost_pattern = re.compile(r"\b(?:is|mode)\b")

    best_sentence = ""
    best_score = 0
    source = ""

    for doc in docs:
        for sent in doc.page_content.split("."):
            sent_clean = sent.strip()

            # Skip fragments too short to be a useful answer.
            if len(sent_clean) < 20:
                continue

            sent_lower = sent_clean.lower()
            score = fuzz.partial_ratio(query_lower, sent_lower)

            if boost_pattern.search(sent_lower):
                score += 10

            if score > best_score:
                best_score = score
                best_sentence = sent_clean
                source = doc.page_content[:200]

    if not best_sentence:
        return "⚠️ No relevant answer found in the document.", ""

    return best_sentence, source
80
+
81
+
82
+ # ===== CHAT =====
83
def chat(user_input, history):
    """Answer one chat turn and append it to the conversation history.

    Args:
        user_input: The text typed by the user.
        history: The chatbot's current list of ``(user, bot)`` message pairs;
            ``None`` is treated as an empty conversation.

    Returns:
        A ``("", history)`` tuple: the empty string clears the input box and
        ``history`` is the updated conversation.
    """
    # Work on a copy so the caller's list is never mutated in place.
    history = list(history) if history else []

    # Nothing to answer: leave the conversation unchanged.
    if not user_input or not user_input.strip():
        return "", history

    answer, source = get_answer(user_input)

    reply = answer
    # Only show the footer when there is an actual source excerpt.
    if source:
        reply += "\n\n📌 Source: " + source

    history.append((user_input, reply))
    return "", history
87
+
88
+
89
+ # ===== FEEDBACK =====
90
def feedback(msg):
    """Log a piece of user feedback to stdout and return a status message.

    Args:
        msg: The feedback text from the UI; may be empty or ``None``.

    Returns:
        A status string for the feedback status box.
    """
    # Do not acknowledge (or log) an empty submission.
    if not msg or not msg.strip():
        return "⚠️ Please enter some feedback first."

    print("Feedback:", msg)
    return "✅ Feedback received"
93
+
94
+
95
+ # ===== UI =====
96
# Build the Gradio interface: PDF upload + processing, a chat panel, and a
# feedback box. The CSS override lets the app use the full page width.
with gr.Blocks(css="""
.gradio-container {max-width: 100% !important;}
""") as demo:

    gr.Markdown("# 🤖 DocuMind")

    # --- PDF upload and indexing ---
    file = gr.File(label="📄 Upload PDF")
    status = gr.Textbox(label="Status")

    process_btn = gr.Button("Process PDF")

    # --- Chat ---
    chatbot = gr.Chatbot(height=400)

    with gr.Row():
        txt = gr.Textbox(placeholder="Ask question...")
        send = gr.Button("Send")

    clear = gr.Button("Clear Chat")

    # --- Feedback ---
    gr.Markdown("### 💬 Feedback")
    fb = gr.Textbox(placeholder="Suggestions...")
    fb_btn = gr.Button("Submit")
    fb_out = gr.Textbox(label="Status")

    # --- Event wiring ---
    process_btn.click(process_pdf, file, status)
    # Button click and Enter key both submit the question.
    send.click(chat, [txt, chatbot], [txt, chatbot])
    txt.submit(chat, [txt, chatbot], [txt, chatbot])
    # Returning an empty list resets the chat history.
    clear.click(lambda: [], None, chatbot)
    fb_btn.click(feedback, fb, fb_out)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ rapidfuzz
3
+ pymupdf
4
+ easyocr
5
+ numpy
6
+ langchain
7
+ langchain-community
8
+ langchain-text-splitters
9
+ faiss-cpu
10
+ sentence-transformers