| # # --------------------------------------------------------------------------------------- | |
| # # Imports and Options | |
| # # --------------------------------------------------------------------------------------- | |
| # import streamlit as st | |
| # import pandas as pd | |
| # import requests | |
| # import re | |
| # import fitz # PyMuPDF | |
| # import io | |
| # import matplotlib.pyplot as plt | |
| # from PIL import Image | |
| # from transformers import AutoProcessor, AutoModelForVision2Seq | |
| # from docling_core.types.doc import DoclingDocument | |
| # from docling_core.types.doc.document import DocTagsDocument | |
| # import torch | |
| # import os | |
| # from huggingface_hub import InferenceClient | |
| # # --------------------------------------------------------------------------------------- | |
| # # Streamlit Page Configuration | |
| # # --------------------------------------------------------------------------------------- | |
| # st.set_page_config( | |
| # page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App", | |
| # page_icon=":bar_chart:", | |
| # layout="centered", | |
| # initial_sidebar_state="auto", | |
| # menu_items={ | |
| # 'Get Help': 'mailto:support@mtss.ai', | |
| # 'About': "This app is built to support PDF analysis" | |
| # } | |
| # ) | |
| # # --------------------------------------------------------------------------------------- | |
| # # Session State Initialization | |
| # # --------------------------------------------------------------------------------------- | |
| # for key in ['pdf_processed', 'markdown_texts', 'df']: | |
| # if key not in st.session_state: | |
| # st.session_state[key] = False if key == 'pdf_processed' else [] | |
| # # --------------------------------------------------------------------------------------- | |
| # # API Configuration | |
| # # --------------------------------------------------------------------------------------- | |
| # # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7" | |
| # # headers = { | |
| # # 'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805', | |
| # # 'Content-Type': 'application/json' | |
| # # } | |
| # # Retrieve Hugging Face API key from environment variables | |
| # hf_api_key = os.getenv('HF_API_KEY') | |
| # if not hf_api_key: | |
| # raise ValueError("HF_API_KEY not set in environment variables") | |
| # # Create the Hugging Face inference client | |
| # client = InferenceClient(api_key=hf_api_key) | |
| # # # --------------------------------------------------------------------------------------- | |
| # # # Survey Analysis Class | |
| # # # --------------------------------------------------------------------------------------- | |
| # # class SurveyAnalysis: | |
| # # def prepare_llm_input(self, survey_response, topics): | |
| # # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()]) | |
| # # return f"""Extract and summarize PDF notes based on topics: | |
| # # {topic_descriptions} | |
| # # Instructions: | |
| # # - Extract exact quotes per topic. | |
| # # - Ignore irrelevant topics. | |
| # # Format: | |
| # # [Topic] | |
| # # - "Exact quote" | |
| # # Meeting Notes: | |
| # # {survey_response} | |
| # # """ | |
| # # def query_api(self, payload): | |
| # # try: | |
| # # res = requests.post(API_URL, headers=headers, json=payload, timeout=60) | |
| # # res.raise_for_status() | |
| # # return res.json() | |
| # # except requests.exceptions.RequestException as e: | |
| # # st.error(f"API request failed: {e}") | |
| # # return {'outputs': {'out-0': ''}} | |
| # # def extract_meeting_notes(self, response): | |
| # # return response.get('outputs', {}).get('out-0', '') | |
| # # def process_dataframe(self, df, topics): | |
| # # results = [] | |
| # # for _, row in df.iterrows(): | |
| # # llm_input = self.prepare_llm_input(row['Document_Text'], topics) | |
| # # payload = {"user_id": "user", "in-0": llm_input} | |
| # # response = self.query_api(payload) | |
| # # notes = self.extract_meeting_notes(response) | |
| # # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes}) | |
| # # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1) | |
| # # --------------------------------------------------------------------------------------- | |
| # # Survey Analysis Class | |
| # # --------------------------------------------------------------------------------------- | |
| # class SurveyAnalysis: | |
| # def prepare_llm_input(self, survey_response, topics): | |
| # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()]) | |
| # return f"""Extract and summarize PDF notes based on topics: | |
| # {topic_descriptions} | |
| # Instructions: | |
| # - Extract exact quotes per topic. | |
| # - Ignore irrelevant topics. | |
| # Format: | |
| # [Topic] | |
| # - "Exact quote" | |
| # Meeting Notes: | |
| # {survey_response} | |
| # """ | |
| # def prompt_response_from_hf_llm(self, llm_input): | |
| # # Define a system prompt to guide the model's responses | |
| # system_prompt = """ | |
| # <Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona> | |
| # <Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task> | |
| # <Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context> | |
| # <Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format> | |
| # """ | |
| # # Generate the refined prompt using Hugging Face API | |
| # response = client.chat.completions.create( | |
| # model="meta-llama/Llama-3.1-70B-Instruct", | |
| # messages=[ | |
| # {"role": "system", "content": system_prompt}, # Add system prompt here | |
| # {"role": "user", "content": llm_input} | |
| # ], | |
| # stream=True, | |
| # temperature=0.5, | |
| # max_tokens=1024, | |
| # top_p=0.7 | |
| # ) | |
| # # Combine messages if response is streamed | |
| # response_content = "" | |
| # for message in response: | |
| # response_content += message.choices[0].delta.content | |
| # return response_content.strip() | |
| # def extract_text(self, response): | |
| # return response | |
| # def process_dataframe(self, df, topics): | |
| # results = [] | |
| # for _, row in df.iterrows(): | |
| # llm_input = self.prepare_llm_input(row['Document_Text'], topics) | |
| # response = self.prompt_response_from_hf_llm(llm_input) | |
| # notes = self.extract_text(response) | |
| # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes}) | |
| # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1) | |
| # # --------------------------------------------------------------------------------------- | |
| # # Helper Functions | |
| # # --------------------------------------------------------------------------------------- | |
| # @st.cache_resource | |
| # def load_smol_docling(): | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview") | |
| # model = AutoModelForVision2Seq.from_pretrained( | |
| # "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32 | |
| # ).to(device) | |
| # return model, processor | |
| # model, processor = load_smol_docling() | |
| # def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600): | |
| # images = [] | |
| # doc = fitz.open(stream=pdf_file.read(), filetype="pdf") | |
| # for page in doc: | |
| # pix = page.get_pixmap(dpi=dpi) | |
| # img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB") | |
| # img.thumbnail((max_size, max_size), Image.LANCZOS) | |
| # images.append(img) | |
| # return images | |
| # def extract_markdown_from_image(image): | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # prompt = processor.apply_chat_template([{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}], add_generation_prompt=True) | |
| # inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device) | |
| # with torch.no_grad(): | |
| # generated_ids = model.generate(**inputs, max_new_tokens=1024) | |
| # doctags = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False)[0].replace("<end_of_utterance>", "").strip() | |
| # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image]) | |
| # doc = DoclingDocument(name="ExtractedDocument") | |
| # doc.load_from_doctags(doctags_doc) | |
| # return doc.export_to_markdown() | |
| # def extract_excerpts(processed_df): | |
| # rows = [] | |
| # for _, r in processed_df.iterrows(): | |
| # for sec in re.split(r'\n(?=\[)', r['Topic_Summary']): | |
| # topic_match = re.match(r'\[([^\]]+)\]', sec) | |
| # if topic_match: | |
| # topic = topic_match.group(1) | |
| # excerpts = re.findall(r'- "([^"]+)"', sec) | |
| # for excerpt in excerpts: | |
| # rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic}) | |
| # return pd.DataFrame(rows) | |
| # # --------------------------------------------------------------------------------------- | |
| # # Streamlit UI | |
| # # --------------------------------------------------------------------------------------- | |
| # st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App") | |
| # uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"]) | |
| # if uploaded_file and not st.session_state['pdf_processed']: | |
| # with st.spinner("Processing PDF..."): | |
| # images = convert_pdf_to_images(uploaded_file) | |
| # markdown_texts = [extract_markdown_from_image(img) for img in images] | |
| # st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts}) | |
| # st.session_state['pdf_processed'] = True | |
| # st.success("PDF processed successfully!") | |
| # if st.session_state['pdf_processed']: | |
| # st.markdown("### Extracted Text Preview") | |
| # st.write(st.session_state['df'].head()) | |
| # st.markdown("### Enter Topics and Descriptions") | |
| # num_topics = st.number_input("Number of topics", 1, 10, 1) | |
| # topics = {} | |
| # for i in range(num_topics): | |
| # topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}") | |
| # desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}") | |
| # if topic and desc: | |
| # topics[topic] = desc | |
| # if st.button("Run Analysis"): | |
| # if not topics: | |
| # st.warning("Please enter at least one topic and description.") | |
| # st.stop() | |
| # analyzer = SurveyAnalysis() | |
| # processed_df = analyzer.process_dataframe(st.session_state['df'], topics) | |
| # extracted_df = extract_excerpts(processed_df) | |
| # st.markdown("### Extracted Excerpts") | |
| # st.dataframe(extracted_df) | |
| # csv = extracted_df.to_csv(index=False) | |
| # st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv") | |
| # topic_counts = extracted_df['Topic'].value_counts() | |
| # fig, ax = plt.subplots() | |
| # topic_counts.plot.bar(ax=ax, color='#3d9aa1') | |
| # st.pyplot(fig) | |
| # if not uploaded_file: | |
| # st.info("Please upload a PDF file to begin.") | |
| # --------------------------------------------------------------------------------------- | |
| # Imports and Options | |
| # --------------------------------------------------------------------------------------- | |
| import streamlit as st | |
| import pandas as pd | |
| import requests | |
| import re | |
| import fitz # PyMuPDF | |
| import io | |
| import matplotlib.pyplot as plt | |
| from PIL import Image | |
| from transformers import AutoProcessor, AutoModelForVision2Seq | |
| from docling_core.types.doc import DoclingDocument | |
| from docling_core.types.doc.document import DocTagsDocument | |
| import torch | |
| import os | |
| from huggingface_hub import InferenceClient | |
| # --------------------------------------------------------------------------------------- | |
| # Streamlit Page Configuration | |
| # --------------------------------------------------------------------------------------- | |
# Page-level settings: browser title and icon, layout, sidebar state, and the
# "Get Help" / "About" menu entries.
st.set_page_config(
    menu_items={
        'Get Help': 'mailto:support@mtss.ai',
        'About': "This app is built to support PDF analysis",
    },
    initial_sidebar_state="auto",
    layout="centered",
    page_icon=":bar_chart:",
    page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
)
| # --------------------------------------------------------------------------------------- | |
| # Session State Initialization | |
| # --------------------------------------------------------------------------------------- | |
# Seed each session key only if it is missing: the processed flag starts as
# False, every other key starts as a fresh empty list.
for key in ('pdf_processed', 'markdown_texts', 'df'):
    if key in st.session_state:
        continue
    if key == 'pdf_processed':
        st.session_state[key] = False
    else:
        st.session_state[key] = []
| # --------------------------------------------------------------------------------------- | |
| # API Configuration | |
| # --------------------------------------------------------------------------------------- | |
# The Hugging Face token is read from the HF_API_KEY environment variable;
# an unset or empty value aborts start-up with a ValueError.
hf_api_key = os.environ.get('HF_API_KEY')
if hf_api_key:
    # Module-level inference client used for the chat-completion requests.
    client = InferenceClient(api_key=hf_api_key)
else:
    raise ValueError("HF_API_KEY not set in environment variables")
| # --------------------------------------------------------------------------------------- | |
| # Survey Analysis Class | |
| # --------------------------------------------------------------------------------------- | |
class SurveyAnalysis:
    """Extract topic-specific quotes from document text with a hosted LLM.

    For each row of a DataFrame the class builds a prompt from the row's
    ``Document_Text`` and the user-supplied topics, sends it through the
    module-level Hugging Face ``client``, and stores the model's reply in a
    ``Topic_Summary`` column.
    """

    # System prompt sent with every chat-completion request.
    _SYSTEM_PROMPT = (
        "\n"
        "<Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona>\n"
        "<Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task>\n"
        "<Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context>\n"
        "<Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format>\n"
    )

    def prepare_llm_input(self, survey_response, topics):
        """Build the user prompt for one piece of document text.

        Args:
            survey_response: Text the quotes should be extracted from.
            topics: Mapping of topic name to topic description.

        Returns:
            The prompt string: a bullet list of topics, the extraction
            instructions, the expected ``[Topic]`` / ``- "quote"`` output
            format, and finally the text itself.
        """
        topic_descriptions = "\n".join(
            f"- **{t}**: {d}" for t, d in topics.items()
        )
        return (
            "Extract and summarize PDF notes based on topics:\n"
            f"{topic_descriptions}\n"
            "Instructions:\n"
            "- Extract exact quotes per topic.\n"
            "- Ignore irrelevant topics.\n"
            "Format:\n"
            "[Topic]\n"
            '- "Exact quote"\n'
            "Meeting Notes:\n"
            f"{survey_response}\n"
        )

    def prompt_response_from_hf_llm(self, llm_input):
        """Send ``llm_input`` to the chat model and return the full reply.

        The request is streamed; the streamed pieces are joined into one
        string with surrounding whitespace stripped.
        """
        stream = client.chat.completions.create(
            model="meta-llama/Llama-3.1-70B-Instruct",
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": llm_input},
            ],
            stream=True,
            temperature=0.5,
            max_tokens=1024,
            top_p=0.7,
        )
        return self._join_stream_chunks(stream)

    @staticmethod
    def _join_stream_chunks(chunks):
        """Concatenate the text deltas of streamed chat-completion chunks.

        A chunk can arrive with ``delta.content`` set to ``None`` or with an
        empty ``choices`` list; such chunks carry no text and are skipped,
        because adding ``None`` to a ``str`` raises ``TypeError``.
        """
        parts = []
        for chunk in chunks:
            if not chunk.choices:
                continue
            text = chunk.choices[0].delta.content
            if text:
                parts.append(text)
        return "".join(parts).strip()

    def extract_text(self, response):
        """Return the model reply unchanged (hook for post-processing)."""
        return response

    def process_dataframe(self, df, topics):
        """Run the topic extraction for every row of ``df``.

        Args:
            df: DataFrame with a ``Document_Text`` column.
            topics: Mapping of topic name to topic description.

        Returns:
            ``df`` with its index reset and a ``Topic_Summary`` column
            appended; a frame with zero rows yields an empty result with
            both columns instead of raising ``KeyError``.
        """
        summaries = []
        for _, row in df.iterrows():
            llm_input = self.prepare_llm_input(row['Document_Text'], topics)
            response = self.prompt_response_from_hf_llm(llm_input)
            print("AI Response:", response)  # Debugging: print the AI response
            summaries.append(self.extract_text(response))
        # Building the frame from a dict keeps the column present even when
        # there are no rows.
        summary_frame = pd.DataFrame({'Topic_Summary': summaries})
        return pd.concat([df.reset_index(drop=True), summary_frame], axis=1)
| # --------------------------------------------------------------------------------------- | |
| # Helper Functions | |
| # --------------------------------------------------------------------------------------- | |
@st.cache_resource
def load_smol_docling():
    """Load the SmolDocling model and its processor.

    Streamlit re-executes the script on every widget interaction, so the
    loader is wrapped in ``st.cache_resource`` to keep one loaded instance
    instead of loading the weights again on each rerun.

    Returns:
        A ``(model, processor)`` tuple; the model is moved to CUDA when
        ``torch.cuda.is_available()`` reports a GPU, otherwise it stays on
        the CPU.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
    model = AutoModelForVision2Seq.from_pretrained(
        "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
    ).to(device)
    return model, processor


# Module-level handles used by extract_markdown_from_image.
model, processor = load_smol_docling()
def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
    """Render every page of a PDF as an RGB PIL image.

    Args:
        pdf_file: File-like object whose ``read()`` returns the PDF bytes.
        dpi: Resolution used when rasterising each page.
        max_size: Upper bound, in pixels, for the width and height of each
            image; larger pages are shrunk in place with ``thumbnail``.

    Returns:
        A list with one ``PIL.Image`` per page, in page order.
    """
    images = []
    # The context manager closes the document even if rendering a page
    # fails; the original never closed it.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            # convert() yields a standalone image, so nothing refers back
            # to the document once it is closed.
            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
            img.thumbnail((max_size, max_size), Image.LANCZOS)
            images.append(img)
    return images
def extract_markdown_from_image(image):
    """Convert one page image to Markdown with the SmolDocling model.

    The module-level ``processor`` and ``model`` generate DocTags for the
    image; the DocTags are then loaded into a ``DoclingDocument`` and
    exported as Markdown.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Single-turn chat: the page image followed by the conversion request.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Convert this page to docling."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(target_device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

    # Drop the prompt tokens so only the newly generated ones are decoded.
    prompt_length = inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=False)
    doctags = decoded[0].replace("<end_of_utterance>", "").strip()

    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    document = DoclingDocument(name="ExtractedDocument")
    document.load_from_doctags(doctags_doc)
    return document.export_to_markdown()
# Column order of the frame returned by extract_excerpts; also applied when
# nothing matched so the result always has the same schema.
_EXCERPT_COLUMNS = ['Document_Text', 'Topic_Summary', 'Excerpt', 'Topic']
# A new section starts at every newline that is followed by "[".
_SECTION_SPLIT_RE = re.compile(r'\n(?=\[)')
# "[Topic]" header at the start of a section.
_TOPIC_HEADER_RE = re.compile(r'\[([^\]]+)\]')
# Bullet lines of the form: - "quoted excerpt"
_QUOTED_EXCERPT_RE = re.compile(r'- "([^"]+)"')


def extract_excerpts(processed_df):
    """Explode each ``Topic_Summary`` into one row per quoted excerpt.

    Args:
        processed_df: DataFrame with ``Document_Text`` and ``Topic_Summary``
            columns, where the summary follows the ``[Topic]`` /
            ``- "quote"`` layout.

    Returns:
        DataFrame with the columns ``Document_Text``, ``Topic_Summary``,
        ``Excerpt`` and ``Topic``. The columns are present even when no
        excerpt was found. Rows whose summary is not a string (for example
        ``None`` or NaN) are skipped instead of raising.
    """
    rows = []
    for _, r in processed_df.iterrows():
        summary = r['Topic_Summary']
        if not isinstance(summary, str):
            continue
        for sec in _SECTION_SPLIT_RE.split(summary):
            topic_match = _TOPIC_HEADER_RE.match(sec)
            if not topic_match:
                # Text without a leading "[Topic]" header belongs to no topic.
                continue
            topic = topic_match.group(1)
            for excerpt in _QUOTED_EXCERPT_RE.findall(sec):
                rows.append({
                    'Document_Text': r['Document_Text'],
                    'Topic_Summary': summary,
                    'Excerpt': excerpt,
                    'Topic': topic,
                })
    print("Extracted Rows:", rows)  # Debugging: print extracted rows
    return pd.DataFrame(rows, columns=_EXCERPT_COLUMNS)
| # --------------------------------------------------------------------------------------- | |
| # Streamlit UI | |
| # --------------------------------------------------------------------------------------- | |
st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

# Identify the current upload by name and size so that choosing a different
# PDF triggers reprocessing; a bare "already processed" flag would keep
# showing the first file's text.
current_file_key = (uploaded_file.name, uploaded_file.size) if uploaded_file else None
needs_processing = bool(uploaded_file) and (
    not st.session_state['pdf_processed']
    or st.session_state.get('pdf_file_key') != current_file_key
)

if needs_processing:
    with st.spinner("Processing PDF..."):
        images = convert_pdf_to_images(uploaded_file)
        markdown_texts = [extract_markdown_from_image(img) for img in images]
        st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
    st.session_state['pdf_processed'] = True
    st.session_state['pdf_file_key'] = current_file_key
    st.success("PDF processed successfully!")

if st.session_state['pdf_processed']:
    st.markdown("### Extracted Text Preview")
    st.write(st.session_state['df'].head())

    st.markdown("### Enter Topics and Descriptions")
    num_topics = st.number_input("Number of topics", 1, 10, 1)
    topics = {}
    for i in range(num_topics):
        topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
        desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
        # Only topics with both a name and a description are analysed.
        if topic and desc:
            topics[topic] = desc

    if st.button("Run Analysis"):
        if not topics:
            st.warning("Please enter at least one topic and description.")
            st.stop()

        analyzer = SurveyAnalysis()
        processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
        extracted_df = extract_excerpts(processed_df)

        st.markdown("### Extracted Excerpts")
        st.dataframe(extracted_df)
        csv = extracted_df.to_csv(index=False)
        st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")

        if not extracted_df.empty:
            topic_counts = extracted_df['Topic'].value_counts()
            fig, ax = plt.subplots()
            topic_counts.plot.bar(ax=ax, color='#3d9aa1')
            st.pyplot(fig)
            # Release the figure; pyplot otherwise keeps every figure
            # created across reruns alive.
            plt.close(fig)
        else:
            st.warning("No topics were extracted. Please check the input data and topics.")

if not uploaded_file:
    st.info("Please upload a PDF file to begin.")