okr/sla_analyzer/app.py
2025-01-02 19:39:39 +01:00

106 lines
4.2 KiB
Python

import streamlit as st
import fitz # PyMuPDF for PDF text extraction
import pandas as pd
import json
from utilities.utils import extract_text_from_pdf, chunk_text, construct_prompt_for_pdf
#from utilities.utils import extract_text_from_pdf, chunk_text, construct_prompt_for_pdf
from utilities.api import call_prompt
# Instruction template for the per-chunk extraction request. It is handed to
# construct_prompt_for_pdf together with pdf_input=<chunk text>; presumably
# that helper substitutes the chunk for the {pdf_input} placeholder
# (NOTE(review): confirm against utilities.utils).
# The text starts and ends with a newline.
prompt_template = (
    "\n"
    "You are an AI assistant analyzing a service level agreement (SLA) document. "
    "Extract the following structured information in JSON format:\n"
    "1. Airline name and ground handling partner.\n"
    "2. Effective date of the agreement.\n"
    "3. Airport location.\n"
    "4. Delay codes penalized, including target ranges and penalties.\n"
    "5. Baggage loss reasons and corresponding penalties.\n"
    "Here is the document content:\n"
    "{pdf_input}\n"
    "Provide output in JSON format only.\n"
)
# ---- Page layout ----------------------------------------------------------

# Text pre-filled into the "Custom Prompt" box: the extraction instructions
# without a document-content section.
_default_custom_prompt = """
You are an AI assistant analyzing a service level agreement (SLA) document. Extract the following structured information in JSON format:
1. Airline name and ground handling partner.
2. Effective date of the agreement.
3. Airport location.
4. Delay codes penalized, including target ranges and penalties.
5. Baggage loss reasons and corresponding penalties.
Provide output in JSON format only.
"""

st.title("PDF SLA Analyzer")

# Section 1: the document to analyze; the uploader is restricted to "pdf".
st.header("Upload a PDF File")
uploaded_file = st.file_uploader(label="Choose a PDF file to analyze", type=["pdf"])

# Section 2: editable prompt text, initialised with the default instructions.
st.header("Enter Your Custom Prompt")
custom_prompt = st.text_area(label="Custom Prompt", value=_default_custom_prompt)
# ---- "Analyze" action -----------------------------------------------------
# Helpers for turning the per-chunk model responses into one JSON value.


def _is_blank_value(value):
    """Return True when *value* is a placeholder that a later chunk may replace."""
    return value is None or value == "" or value == "N/A"


def _merge_values(current, incoming):
    """Merge two parsed JSON values that were found under the same key.

    Lists are concatenated in chunk order, dicts are merged key by key
    (recursively), and for any other combination the first non-blank value
    is kept.
    """
    if isinstance(current, list) and isinstance(incoming, list):
        return current + incoming
    if isinstance(current, dict) and isinstance(incoming, dict):
        merged = dict(current)
        for key, value in incoming.items():
            merged[key] = _merge_values(merged[key], value) if key in merged else value
        return merged
    return incoming if _is_blank_value(current) else current


def _combine_chunk_results(raw_results):
    """Parse each non-blank chunk response and fold them into one JSON value.

    A single response is returned exactly as parsed. Several responses must
    all be JSON objects and are merged with ``_merge_values``.

    Raises:
        ValueError: if a response is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass), if there is nothing to parse, or
            if several responses are not all JSON objects.
    """
    parsed = [json.loads(raw) for raw in raw_results if raw.strip()]
    if not parsed:
        raise ValueError("no JSON content in the model responses")
    if len(parsed) == 1:
        return parsed[0]
    if not all(isinstance(item, dict) for item in parsed):
        raise ValueError("cannot merge chunk responses that are not JSON objects")
    combined = parsed[0]
    for item in parsed[1:]:
        combined = _merge_values(combined, item)
    return combined


# Button to process the PDF and call the API
if st.button("Analyze"):
    if uploaded_file is None:
        st.error("Please upload a PDF file to analyze.")
    else:
        # Step 1: Extract text from the uploaded PDF
        with st.spinner("Extracting text from PDF..."):
            pdf_text = extract_text_from_pdf(uploaded_file)

        # Step 2: Chunk the text if necessary
        with st.spinner("Chunking large text..."):
            chunks = chunk_text(pdf_text)

        # Step 3: Query the model once per chunk and collect the raw responses
        # NOTE(review): the text entered in the "Custom Prompt" box
        # (custom_prompt) is not used here; every request is built from
        # prompt_template. Confirm whether that is intended.
        all_results = []
        with st.spinner("Querying GPT-4-Turbo..."):
            for i, chunk in enumerate(chunks):
                st.text(f"Processing chunk {i + 1}/{len(chunks)}...")
                prompt = construct_prompt_for_pdf(prompt_template=prompt_template, pdf_input=chunk)
                result = call_prompt(prompt)
                print("result:", result)
                all_results.append(result)

        # Raw text of all responses, kept for logging and for the error view.
        combined_result = "\n".join(all_results)
        print("combined_result:", combined_result)

        # Step 4: Display the parsed response. Each chunk is parsed on its own
        # because several JSON documents joined by newlines are not valid JSON.
        st.subheader("Raw JSON Response")
        try:
            json_data = _combine_chunk_results(all_results)
        except ValueError:
            # Keep json_data bound so the table step below is skipped cleanly
            # instead of failing on an undefined name.
            json_data = None
            st.error("Failed to parse JSON response.")
            st.text(combined_result)  # Show raw response if parsing fails
        else:
            st.json(json_data)

        # Step 5: Display results in tables (only for a valid JSON object that
        # contains at least one of the penalty sections)
        if isinstance(json_data, dict) and (
            "delay_penalties" in json_data or "baggage_loss_penalties" in json_data
        ):
            st.subheader("Extracted Data Table")

            # Delay penalties (if available)
            if "delay_penalties" in json_data:
                delay_df = pd.DataFrame(json_data.get("delay_penalties", []))
                st.write("**Delay Penalties:**")
                st.table(delay_df)

            # Baggage loss penalties (if available)
            if "baggage_loss_penalties" in json_data:
                baggage_df = pd.DataFrame(json_data.get("baggage_loss_penalties", []))
                st.write("**Baggage Loss Penalties:**")
                st.table(baggage_df)

            # Other metadata (airline, airport, ...); missing keys show "N/A"
            metadata_keys = ["airline", "ground_handling_partner", "effective_date", "airport"]
            metadata = {key: json_data.get(key, "N/A") for key in metadata_keys}
            st.write("**Metadata:**")
            metadata_df = pd.DataFrame([metadata])
            st.table(metadata_df)