Deeksha14 commited on
Commit
2e43fd1
·
verified ·
1 Parent(s): 52276c8

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +23 -0
  2. main.py +20 -0
  3. requirements.txt +10 -0
  4. streamlit_app.py +150 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use official lightweight Python image
FROM python:3.10-slim

# Disable usage-stats collection and run headless (prevents write errors).
# FIX: the variable name Streamlit actually reads is
# STREAMLIT_BROWSER_GATHER_USAGE_STATS (maps to browser.gatherUsageStats);
# the original GATHERUSAGESTATS spelling is silently ignored.
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
ENV STREAMLIT_DISABLE_WATCHDOG_WARNINGS=true
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
# Point HOME at a writable directory so Streamlit can create ~/.streamlit
ENV HOME=/tmp

# Set working directory
WORKDIR /app

# Copy requirements and install first so this layer is cached across code edits
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the code
COPY . .

# Run the app
CMD ["streamlit", "run", "streamlit_app.py", "--server.port=7860", "--server.address=0.0.0.0"]
main.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import modal

app = modal.App("chatpdf-app")

# Container image: install pinned requirements and bundle the Streamlit
# script at a fixed path inside the container.
image = (
    modal.Image.debian_slim()
    .pip_install_from_requirements("requirements.txt")
    # FIX: modal.Image has no `.with_file` method; the supported API is
    # add_local_file(local_path, remote_path).
    .add_local_file("streamlit_app.py", "/root/app/streamlit_app.py")
)

@app.function(image=image)
@modal.web_server(port=7860, startup_timeout=120)
def launch():
    """Launch Streamlit in the background and return.

    @modal.web_server expects the wrapped function to *start* the server
    and return; Modal then polls the port for up to startup_timeout
    seconds. The original blocking subprocess.run never returned, so the
    startup handshake could stall — Popen (the pattern from Modal's
    web-server docs) launches the process without blocking. The child
    inherits stdout/stderr by default, matching the original logging.
    """
    import subprocess
    subprocess.Popen(
        [
            "streamlit", "run", "/root/app/streamlit_app.py",
            "--server.port=7860", "--server.address=0.0.0.0",
        ]
    )
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ python-dotenv
4
+ langchain
5
+ langchain-community
6
+ langchain-google-genai
7
+ faiss-cpu
8
+ PyPDF2
9
+ python-docx
10
+ beautifulsoup4
streamlit_app.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ from docx import Document
4
+ from bs4 import BeautifulSoup
5
+ import os
6
+ import google.generativeai as genai
7
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.prompts import PromptTemplate
12
+
13
# ========================
# 1️⃣ Configuration
# ========================
# Read the Gemini API key from the environment; fail fast with a visible
# error in the UI when it is missing (st.stop halts script execution).
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    st.error("GOOGLE_API_KEY not found. Please set it in Modal Secrets.")
    st.stop()

genai.configure(api_key=api_key)

# ========================
# 2️⃣ File Size Limits
# ========================
# Upload caps in megabytes, enforced by validate_file_sizes.
MAX_TOTAL_SIZE_MB = 5
MAX_FILE_SIZE_MB = 2
28
+
29
def validate_file_sizes(uploaded_files):
    """Check per-file and aggregate upload limits.

    Emits a Streamlit warning and returns False on the first violation;
    returns True when every file fits within both limits.
    """
    running_total_mb = 0
    for upload in uploaded_files:
        megabytes = upload.size / (1024 * 1024)
        if megabytes > MAX_FILE_SIZE_MB:
            st.warning(f"{upload.name} is too large ({megabytes:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        running_total_mb += megabytes

    if running_total_mb > MAX_TOTAL_SIZE_MB:
        st.warning(f"Total size of uploaded files is {running_total_mb:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
        return False

    return True
43
+
44
+ # ========================
45
+ # 3️⃣ Text Extraction Functions
46
+ # ========================
47
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page of every PDF."""
    pages_text = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                pages_text.append(extracted)
    return "".join(pages_text)
56
+
57
def get_docx_text(docx_file):
    """Return the document's paragraph text, newline-separated."""
    paragraphs = Document(docx_file).paragraphs
    return "\n".join(p.text for p in paragraphs)
60
+
61
def get_html_text(html_file):
    """Strip markup from an uploaded HTML file and return the plain text."""
    markup = html_file.read()
    return BeautifulSoup(markup, "html.parser").get_text()
65
+
66
+ # ========================
67
+ # 4️⃣ Text Chunking and Vector Store
68
+ # ========================
69
def get_text_chunks(text):
    """Split raw text into 2000-char chunks with 200-char overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    return splitter.split_text(text)
72
+
73
def get_vector_store(text_chunks):
    """Embed the chunks with Gemini embeddings and persist a FAISS index.

    The index is written under /tmp because that is the writable location
    in the Modal/Docker runtime.
    """
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    index = FAISS.from_texts(text_chunks, embedding=embedder)
    index.save_local("/tmp/faiss_index")
77
+
78
+ # ========================
79
+ # 5️⃣ Conversational Chain Setup
80
+ # ========================
81
def get_conversational_chain():
    """Build a "stuff"-type QA chain over Gemini 1.5 Flash.

    The prompt instructs the model to answer only from the retrieved
    context and to say explicitly when the answer is absent.
    """
    prompt_template = """
    Answer the question as detailed as possible from the provided context. If the answer is not available, say "answer is not available in the context."

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    # Low temperature keeps answers grounded in the retrieved context.
    model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain
97
+
98
def user_input(user_question):
    """Answer a question against the persisted FAISS index and render the reply."""
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # allow_dangerous_deserialization is acceptable here: the index was
    # written by this same app in get_vector_store, not untrusted input.
    index = FAISS.load_local("/tmp/faiss_index", embeddings, allow_dangerous_deserialization=True)
    matches = index.similarity_search(user_question)
    qa_chain = get_conversational_chain()
    result = qa_chain({"input_documents": matches, "question": user_question}, return_only_outputs=True)
    st.write("Reply:", result["output_text"])
105
+
106
+ # ========================
107
+ # 6️⃣ Streamlit App Layout
108
+ # ========================
109
def main():
    """Streamlit entry point: upload documents, build the index, answer questions."""
    st.set_page_config(page_title="Chat with Documents")
    st.header("Chat with your PDF, DOCX, or HTML using Gemini 💬")

    # ✅ Force Streamlit to render immediately → to prevent Modal timeout
    st.write("App loaded successfully ✅. Upload a file from the sidebar to get started.")

    user_question = st.text_input("Ask a question about your uploaded files:")

    # NOTE(review): if no files have been processed yet, FAISS.load_local
    # inside user_input will raise — consider guarding on index existence.
    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Upload & Process Files")
        uploaded_files = st.file_uploader("Upload PDF, DOCX, or HTML files", accept_multiple_files=True, type=['pdf', 'docx', 'html'])

        if st.button("Submit & Process"):
            if not uploaded_files:
                st.warning("Please upload at least one file.")
                return

            if not validate_file_sizes(uploaded_files):
                return

            with st.spinner("Processing files..."):
                # Route each upload to the matching extractor by extension.
                full_text = ""
                for file in uploaded_files:
                    if file.name.endswith(".pdf"):
                        full_text += get_pdf_text([file])
                    elif file.name.endswith(".docx"):
                        full_text += get_docx_text(file)
                    elif file.name.endswith(".html"):
                        full_text += get_html_text(file)
                    else:
                        st.warning(f"Unsupported file type: {file.name}")

                text_chunks = get_text_chunks(full_text)
                get_vector_store(text_chunks)
                st.success("Processing complete!")

if __name__ == "__main__":
    main()