<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>Forem: kasi viswanath vandanapu</title>
    <description>The latest articles on Forem by kasi viswanath vandanapu (@kasi_viswanath).</description>
    <link>https://forem.com/kasi_viswanath</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3776421%2Ff779b863-7e59-4087-b9a0-bf58c25bb092.jpg</url>
      <title>Forem: kasi viswanath vandanapu</title>
      <link>https://forem.com/kasi_viswanath</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://forem.com/feed/kasi_viswanath"/>
    <language>en</language>
    <item>
      <title>SQL Comparison Library Architecture</title>
      <dc:creator>kasi viswanath vandanapu</dc:creator>
      <pubDate>Wed, 01 Apr 2026 03:58:10 +0000</pubDate>
      <link>https://forem.com/kasi_viswanath/sql-comparison-library-architecture-3mff</link>
      <guid>https://forem.com/kasi_viswanath/sql-comparison-library-architecture-3mff</guid>
      <description>&lt;h2&gt;
  
  
  Purpose
&lt;/h2&gt;

&lt;p&gt;Design a deterministic-first library that compares SQL and results on one database instance, explains mismatches, and optionally adds AI judgment.&lt;/p&gt;

&lt;p&gt;The library evaluates:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;user_query&lt;/li&gt;
&lt;li&gt;actual_sql&lt;/li&gt;
&lt;li&gt;expected_sql&lt;/li&gt;
&lt;li&gt;actual_result&lt;/li&gt;
&lt;li&gt;expected_result&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;The library does not claim universal semantic equivalence. It reports behavior on the evaluated database context.&lt;/p&gt;

&lt;h2&gt;
  
  
  Design Principles
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;Deterministic before AI: deterministic metrics are the source of truth; AI is advisory.&lt;/li&gt;
&lt;li&gt;Structure and result are separate: SQL form and output correctness are scored independently.&lt;/li&gt;
&lt;li&gt;Diagnostics over raw score: every important score must include a reason and evidence.&lt;/li&gt;
&lt;li&gt;Configurable semantics: set, multiset, ordered, and numeric tolerance modes are first-class.&lt;/li&gt;
&lt;li&gt;Production-safe outputs: return machine-friendly schema and human-readable explanations.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  High-Level Architecture
&lt;/h2&gt;

&lt;p&gt;The system is organized into five layers:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Validation&lt;/li&gt;
&lt;li&gt;Structural comparison&lt;/li&gt;
&lt;li&gt;Result comparison&lt;/li&gt;
&lt;li&gt;Diagnostic attribution&lt;/li&gt;
&lt;li&gt;Optional AI judge
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌─────────────────────────────────────────────────────────────────────┐
│  INPUT                                                              │
│  user_query · actual_sql · expected_sql                             │
│  actual_result · expected_result                                    │
└──────────────────────────┬──────────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────────┐
│  LAYER 1 · Validation                                               │
│  ├── Parse actual_sql → AST                                         │
│  ├── Parse expected_sql → AST                                       │
│  ├── Canonicalize SQL strings                                       │
│  ├── Execute both queries in a read-only sandbox                    │
│  ├── Capture DML errors · wall-clock timings · schemas              │
│  └── Materialize result sets                                        │
└───────────┬─────────────────────────────┬───────────────────────────┘
            │ parse + exec ok                 │ parse or exec fail
            ▼                                 │
┌───────────────────────────────────────────┬─────────────────────────┐
│  LAYER 2 · Structural Comparison          │                         │
│  ├── normalized_sql_match  [boolean gate] │                         │
│  ├── clause_match          [diagnostic]   │                         │
│  └── clause_weighted_distance  [score]    │                         │
└───────────┬───────────────────────────────┘                         │
            ▼                                                           │
┌─────────────────────────────────────────────────────────────────────┤
│  LAYER 3 · Result Comparison                                         │
│  ├── result_equality_family  (exact / order-insensitive / ordered)   │
│  ├── schema_match                                                    │
│  ├── row_overlap · cell_overlap  [graded quality]                    │
│  ├── cardinality_match                                               │
│  ├── numeric_tolerance_match  (atol / rtol)                          │
│  └── null_handling_match  (3VL)                                      │
└───────────┬─────────────────────────────────────────────────────────┤
            ▼                                                           │
┌─────────────────────────────────────────────────────────────────────┤
│  LAYER 4 · Diagnostic Attribution                                    │
│  ├── projection / filter / join error scores                         │
│  ├── grouping / aggregate function error scores                      │
│  ├── ordering flag · top-k score · cardinality explosion flag        │
│  ├── per_column_mismatch_map  [explanation layer]                    │
│  └── confidence · evidence_strength · ambiguous_case                │
└───────────┬─────────────────────────────────────────────────────────┤
            ▼                                                           │
┌─────────────────────────────────────────────────────────────────────┤
│  LAYER 5 · Optional AI Judge                                         │
│  ├── intent_adequacy_judgment                                        │
│  ├── database_scoped_equivalence_judgment                            │
│  ├── acceptable_deviation_judgment                                   │
│  ├── short_rationale · evidence_bullets · decision_summary           │
│  ├── severity_classification                                         │
│  └── confidence · evidence_strength · ambiguous_case                │
└───────────┬─────────────────────────────────────────────────────────┘
            │◄──────────────────────────── parse or exec fail path ───┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────────┐
│  FINAL OUTPUT                                                        │
│  deterministic_verdict · ai_advisory_verdict                        │
│  verdict_consistency_flag · overall_score · severity                │
│  structure_score · result_score · diagnostic_scores                 │
│  explanations · mismatch_map · warnings                             │
└─────────────────────────────────────────────────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  1) Validation Layer
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Validate the request payload and runtime config.&lt;/li&gt;
&lt;li&gt;Parse actual_sql and expected_sql into ASTs using a dialect-aware SQL parser.&lt;/li&gt;
&lt;li&gt;Canonicalize SQL strings for structural comparison.&lt;/li&gt;
&lt;li&gt;Execute both queries in a read-only execution sandbox.&lt;/li&gt;
&lt;li&gt;Capture DML runtime errors, wall-clock timings, materialized result sets, and output relation schemas.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Output type: booleans + errors&lt;/p&gt;

&lt;p&gt;Internal intermediates vs external outputs:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;ast_actual, ast_expected, actual_result_table, expected_result_table, actual_schema, and expected_schema are internal intermediates consumed by downstream layers. They are not included in the final report unless explicitly requested via config.&lt;/li&gt;
&lt;li&gt;All other Validation outputs (parse_success, execution_success, execution_error, execution_time, normalized_sql) are external-facing and appear in the final report.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Primary outputs:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;parse_success_actual: True if actual_sql produces a valid AST with no syntax errors.&lt;/li&gt;
&lt;li&gt;parse_success_expected: True if expected_sql produces a valid AST with no syntax errors.&lt;/li&gt;
&lt;li&gt;normalized_sql_actual: Canonicalized SQL string of actual_sql after normalization.&lt;/li&gt;
&lt;li&gt;normalized_sql_expected: Canonicalized SQL string of expected_sql after normalization.&lt;/li&gt;
&lt;li&gt;ast_actual: Abstract Syntax Tree (AST) of actual_sql.&lt;/li&gt;
&lt;li&gt;ast_expected: Abstract Syntax Tree (AST) of expected_sql.&lt;/li&gt;
&lt;li&gt;execution_success_actual: True if actual_sql executes without a DML runtime error.&lt;/li&gt;
&lt;li&gt;execution_success_expected: True if expected_sql executes without a DML runtime error.&lt;/li&gt;
&lt;li&gt;execution_error_actual: Error taxonomy category + raw DB error message for actual_sql, if any.&lt;/li&gt;
&lt;li&gt;execution_error_expected: Error taxonomy category + raw DB error message for expected_sql, if any.&lt;/li&gt;
&lt;li&gt;execution_time_actual_ms: Wall-clock execution time of actual_sql in milliseconds.&lt;/li&gt;
&lt;li&gt;execution_time_expected_ms: Wall-clock execution time of expected_sql in milliseconds.&lt;/li&gt;
&lt;li&gt;actual_result_table: Materialized result set (tuples) returned by actual_sql.&lt;/li&gt;
&lt;li&gt;expected_result_table: Materialized result set (tuples) returned by expected_sql.&lt;/li&gt;
&lt;li&gt;actual_schema: Output relation schema of actual_sql — attribute names and data types.&lt;/li&gt;
&lt;li&gt;expected_schema: Output relation schema of expected_sql — attribute names and data types.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  2) Structural Comparison Layer
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Compare canonicalized SQL strings as a fast boolean gate.&lt;/li&gt;
&lt;li&gt;Perform per-clause AST comparison across all SQL clauses.&lt;/li&gt;
&lt;li&gt;Compute a clause-weighted structural divergence score.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Output type: scores + clause diffs&lt;/p&gt;

&lt;p&gt;Metric role hierarchy:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;normalized_sql_match: fast boolean gate — if true, deeper structural comparison can be skipped.&lt;/li&gt;
&lt;li&gt;clause_match: diagnostic detail — reveals per-clause divergence regardless of string match outcome.&lt;/li&gt;
&lt;li&gt;clause_weighted_distance: structural score — aggregated from per-clause weighted comparison.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Primary outputs:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;normalized_sql_match: Boolean gate — True if canonicalized SQL strings are identical after normalization.&lt;/li&gt;
&lt;li&gt;clause_match: Per-clause AST comparison across SELECT (projection), WHERE (selection predicate), HAVING (post-aggregation predicate), JOIN (join graph and join type), GROUP BY (aggregation grain), ORDER BY, LIMIT/OFFSET (top-k), DISTINCT, WINDOW/OVER, and set operators (UNION/INTERSECT/EXCEPT).&lt;/li&gt;
&lt;li&gt;clause_weighted_distance: Clause-weighted structural divergence score; 0 = identical, higher = more divergent.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  3) Result Comparison Layer
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Compare materialized result sets under configurable set, bag (multiset), or ordered semantics.&lt;/li&gt;
&lt;li&gt;Evaluate relation schema alignment, graded tuple-level and cell-level overlap, cardinality delta, three-valued logic (3VL) NULL consistency, and numeric tolerance.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Output type: scores + mismatch maps&lt;/p&gt;

&lt;p&gt;Result equality family:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Use one metric family, result_equality_family, with an explicit comparison_mode.&lt;/li&gt;
&lt;li&gt;comparison_mode selects evaluation semantics: exact / order-insensitive / order-sensitive.&lt;/li&gt;
&lt;li&gt;result_equality_family.mode_pass is the single boolean pass/fail for the selected mode.&lt;/li&gt;
&lt;li&gt;Optional mode_details can include non-primary mode outcomes for debugging, but they do not act as independent peer metrics.&lt;/li&gt;
&lt;li&gt;row_overlap and cell_overlap are graded quality metrics measuring partial match degree.&lt;/li&gt;
&lt;li&gt;per_column_mismatch_map is the explanation layer and lives in Layer 4 Diagnostic Attribution.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Primary outputs:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;result_equality_family: Mode-driven result comparison object:

&lt;ul&gt;
&lt;li&gt;comparison_mode: exact | order-insensitive | order-sensitive.&lt;/li&gt;
&lt;li&gt;mode_pass: Boolean verdict for the selected comparison_mode.&lt;/li&gt;
&lt;li&gt;mode_details (optional): non-primary mode outcomes retained for diagnostics only.&lt;/li&gt;
&lt;/ul&gt;


&lt;/li&gt;

&lt;li&gt;schema_match: Relation schema comparison — attribute count, attribute names, data types, and optional attribute ordering.&lt;/li&gt;

&lt;li&gt;row_overlap: Graded quality — tuple-level overlap expressed as Jaccard similarity or F1 score (precision + recall over shared tuples).&lt;/li&gt;

&lt;li&gt;cell_overlap: Graded quality — value-level (attribute × tuple) overlap after alignment; measures localized mismatch granularity.&lt;/li&gt;

&lt;li&gt;cardinality_match: Cardinality comparison — absolute row-count delta and relative cardinality ratio between result sets.&lt;/li&gt;

&lt;li&gt;numeric_tolerance_match: Floating-point and decimal comparison using atol and rtol thresholds.&lt;/li&gt;

&lt;li&gt;null_handling_match: True if NULL placement and three-valued logic (3VL) behavior is consistent across result sets.&lt;/li&gt;

&lt;/ul&gt;

&lt;h3&gt;
  
  
  4) Diagnostic Attribution Layer
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Attribute result mismatch to likely root SQL clauses using heuristic diagnostic scoring.&lt;/li&gt;
&lt;li&gt;Emit clause-level error scores with confidence and evidence strength.&lt;/li&gt;
&lt;li&gt;Detect ordering-only mismatch and cardinality fanout or compression anomalies.&lt;/li&gt;
&lt;li&gt;Produce the per-attribute mismatch explanation map.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Output type: error categories + evidence&lt;/p&gt;

&lt;p&gt;Every diagnostic output includes: likely source clause, confidence (0.0–1.0), evidence_strength (low / medium / high), and ambiguous_case flag when multiple root clauses are plausible.&lt;/p&gt;

&lt;p&gt;Primary outputs:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;projection_error_score: Estimated impact of differences in the SELECT projection list — missing, extra, or transformed attributes.&lt;/li&gt;
&lt;li&gt;filter_error_score: Estimated impact of predicate differences in WHERE (selection) and HAVING (post-aggregation filter).&lt;/li&gt;
&lt;li&gt;join_error_score: Estimated impact of join graph topology, join predicate (ON condition), or join type differences (INNER / LEFT / RIGHT / FULL / CROSS / ANTI / SEMI).&lt;/li&gt;
&lt;li&gt;grouping_error_score: Estimated impact of incorrect aggregation grain in the GROUP BY clause.&lt;/li&gt;
&lt;li&gt;aggregate_function_error_score: Estimated impact of wrong aggregate function (SUM vs COUNT vs AVG vs MIN vs MAX vs STDDEV).&lt;/li&gt;
&lt;li&gt;ordering_error_flag: True when mismatch is caused solely by tuple ordering; bag-semantics result sets are equal.&lt;/li&gt;
&lt;li&gt;limit_topk_error_score: Estimated impact of LIMIT, TOP, FETCH FIRST, OFFSET, or ROW_NUMBER top-k differences.&lt;/li&gt;
&lt;li&gt;cardinality_explosion_flag: True when result set cardinality deviates significantly from expected due to join fanout or over-filtering.&lt;/li&gt;
&lt;li&gt;per_column_mismatch_map: Explanation layer — attribute-level mismatch rate and severity breakdown explaining row_overlap and cell_overlap.&lt;/li&gt;
&lt;li&gt;confidence: 0.0–1.0 confidence in the primary diagnostic attribution.&lt;/li&gt;
&lt;li&gt;evidence_strength: low / medium / high — based on corroborating signal count and quality.&lt;/li&gt;
&lt;li&gt;ambiguous_case: True when multiple clauses are plausible root causes with similar evidence weight.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  5) Optional AI Judge Layer
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Apply LLM-as-judge evaluation grounded strictly in deterministic metric outputs.&lt;/li&gt;
&lt;li&gt;Assess natural-language intent adequacy relative to the user query.&lt;/li&gt;
&lt;li&gt;Judge database-scoped equivalence as observed on this database instance — not universal semantic equivalence.&lt;/li&gt;
&lt;li&gt;Classify deviation acceptability using domain and business context.&lt;/li&gt;
&lt;li&gt;Never issue a verdict that contradicts the deterministic verdict from Layers 1–4.&lt;/li&gt;
&lt;li&gt;Consume the optional &lt;code&gt;context&lt;/code&gt; field from the Request Model (schema metadata and business constraints) to ground judgment in domain-specific rules.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Output type: advisory label + rationale&lt;/p&gt;

&lt;p&gt;Verdict structure — the final report separates three fields:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;deterministic_verdict: authoritative pass/fail derived from Layers 1–4.&lt;/li&gt;
&lt;li&gt;ai_advisory_verdict: the AI judge's advisory classification.&lt;/li&gt;
&lt;li&gt;verdict_consistency_flag: True if both verdicts agree; False when they diverge, which triggers a consistency warning.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;AI output format — internal reasoning traces are never exposed; instead:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;short_rationale: one-to-two sentence summary of the AI judge's reasoning.&lt;/li&gt;
&lt;li&gt;evidence_bullets: ordered list of key deterministic signals the AI judge used.&lt;/li&gt;
&lt;li&gt;decision_summary: final label and justification only.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Primary outputs:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;intent_adequacy_judgment: LLM-as-judge assessment of whether actual_sql satisfies user_query intent as well as expected_sql.&lt;/li&gt;
&lt;li&gt;database_scoped_equivalence_judgment: LLM assessment of observed equivalence on this database instance for the given task — does not claim universal semantic equivalence.&lt;/li&gt;
&lt;li&gt;acceptable_deviation_judgment: Grounded classification of whether mismatch is harmless (e.g., ordering-only), acceptable (minor rounding), or critical (wrong data).&lt;/li&gt;
&lt;li&gt;short_rationale: One-to-two sentence summary of the AI judge's reasoning.&lt;/li&gt;
&lt;li&gt;evidence_bullets: Ordered list of key deterministic signals used in the AI judgment.&lt;/li&gt;
&lt;li&gt;decision_summary: Final label and justification without internal reasoning traces.&lt;/li&gt;
&lt;li&gt;severity_classification: Severity triage label: pass / minor issue / moderate issue / major issue / critical failure.&lt;/li&gt;
&lt;li&gt;confidence: 0.0–1.0 confidence in the AI judgment.&lt;/li&gt;
&lt;li&gt;evidence_strength: low / medium / high.&lt;/li&gt;
&lt;li&gt;ambiguous_case: True when the AI judge cannot distinguish between two plausible classifications.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Core Components
&lt;/h2&gt;

&lt;h3&gt;
  
  
  A. Request Model
&lt;/h3&gt;

&lt;p&gt;Input contract for one evaluation run.&lt;/p&gt;

&lt;p&gt;Suggested fields:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;user_query: string&lt;/li&gt;
&lt;li&gt;actual_sql: string&lt;/li&gt;
&lt;li&gt;expected_sql: string&lt;/li&gt;
&lt;li&gt;config: evaluation options&lt;/li&gt;
&lt;li&gt;context: optional schema metadata and business constraints&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  B. Execution Adapter
&lt;/h3&gt;

&lt;p&gt;DB adapter abstraction to support one configured database engine per run.&lt;/p&gt;

&lt;p&gt;Interface shape:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;execute(sql) -&amp;gt; execution metadata + materialized result&lt;/li&gt;
&lt;li&gt;explain(sql) -&amp;gt; optional plan metadata&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Sandbox isolation:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Queries run inside a read-only transaction (or equivalent read-only session mode) to prevent side-effects.&lt;/li&gt;
&lt;li&gt;A configurable query timeout aborts long-running executions.&lt;/li&gt;
&lt;li&gt;On completion or timeout, the transaction is rolled back to ensure no persistent state changes.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  C. SQL Parser and Normalizer
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Parse SQL into an Abstract Syntax Tree (AST) using a dialect-aware parser (e.g., sqlglot).&lt;/li&gt;
&lt;li&gt;Canonicalize SQL string: lowercase keywords, collapse whitespace, normalize alias formatting.&lt;/li&gt;
&lt;li&gt;Normalize commutative expressions (e.g., AND operand ordering) where semantics are preserved.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  D. Structural Comparator
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Per-clause AST matcher across projection (SELECT), selection predicate (WHERE), join graph, aggregation grain (GROUP BY), post-aggregation predicate (HAVING), ordering (ORDER BY), top-k (LIMIT/OFFSET), and set operators.&lt;/li&gt;
&lt;li&gt;Clause-weighted distance calculator producing a normalized structural divergence score.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  E. Result Comparator
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Relation schema comparator (attribute names, data types, attribute order).&lt;/li&gt;
&lt;li&gt;Ordered and unordered tuple comparators for set and bag (multiset) semantics.&lt;/li&gt;
&lt;li&gt;Cell-level (attribute × tuple) alignment and value matching.&lt;/li&gt;
&lt;li&gt;Floating-point comparator using atol/rtol tolerance thresholds.&lt;/li&gt;
&lt;li&gt;NULL-aware comparator respecting three-valued logic (3VL) semantics.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  F. Diagnostic Engine
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Convert raw structural and result divergence signals into clause-attributed error scores.&lt;/li&gt;
&lt;li&gt;Build per-attribute mismatch profiles from the output relation.&lt;/li&gt;
&lt;li&gt;Generate deterministic, evidence-backed explanations without LLM involvement.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  G. Scoring and Severity Engine
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Compute category scores.&lt;/li&gt;
&lt;li&gt;Aggregate weighted overall score.&lt;/li&gt;
&lt;li&gt;Map to severity labels.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  H. Report Builder
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Emit final output schema.&lt;/li&gt;
&lt;li&gt;Include evidence, warnings, and plain-language explanations.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  I. Optional AI Judge Adapter
&lt;/h3&gt;

&lt;p&gt;Responsibilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Construct grounded LLM-as-judge prompts that embed deterministic metric evidence.&lt;/li&gt;
&lt;li&gt;Return structured AI judgments with confidence scores.&lt;/li&gt;
&lt;li&gt;Record model identifier, prompt version, and token usage for reproducibility.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Metric Processing Pipeline
&lt;/h2&gt;

&lt;p&gt;Recommended sequence:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Validation (parse checks + execution)&lt;/li&gt;
&lt;li&gt;Structural comparison metrics&lt;/li&gt;
&lt;li&gt;Result comparison metrics&lt;/li&gt;
&lt;li&gt;Diagnostic attribution metrics (computed after structural and result evidence is available)&lt;/li&gt;
&lt;li&gt;Optional performance metrics&lt;/li&gt;
&lt;li&gt;Optional AI judge metrics&lt;/li&gt;
&lt;li&gt;Final synthesis — compute deterministic_verdict, run AI judge, emit verdict_consistency_flag&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;If either query fails parse or execution, continue producing partial diagnostics where possible and return explicit blocked-reason fields.&lt;/p&gt;

&lt;h3&gt;
  
  
  Failure-State Precedence Rules
&lt;/h3&gt;

&lt;p&gt;Apply deterministic precedence in this exact order during final synthesis:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Parse failure precedence: if parse_success_actual = false or parse_success_expected = false, set deterministic_verdict = fail, set severity = critical failure, and mark downstream structural/result comparisons as blocked.&lt;/li&gt;
&lt;li&gt;Execution failure precedence: if both parse checks pass but execution_success_actual = false or execution_success_expected = false, set deterministic_verdict = fail, set severity = critical failure, and mark result comparison fields as blocked.&lt;/li&gt;
&lt;li&gt;Result-family precedence: if validation passes, evaluate result_equality_family.mode_pass for the configured comparison_mode. A false mode_pass sets deterministic_verdict = fail regardless of advisory AI output.&lt;/li&gt;
&lt;li&gt;Structural/diagnostic precedence: when result equality fails, use structure and diagnostic signals only for attribution and severity refinement; they never override deterministic fail to pass.&lt;/li&gt;
&lt;li&gt;Advisory precedence: ai_advisory_verdict can only annotate. It cannot overturn deterministic_verdict in any failure state.&lt;/li&gt;
&lt;li&gt;Performance metric precedence: if execution_success_actual = false or execution_success_expected = false, performance metrics (execution_time_ratio, resource_cost_score, plan_complexity_score) are blocked and set to null.&lt;/li&gt;
&lt;/ol&gt;

&lt;h2&gt;
  
  
  Configuration Model
&lt;/h2&gt;

&lt;p&gt;Key runtime options:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;comparison_mode: exact / order-insensitive / order-sensitive&lt;/li&gt;
&lt;li&gt;numeric_abs_tolerance: float&lt;/li&gt;
&lt;li&gt;numeric_rel_tolerance: float&lt;/li&gt;
&lt;li&gt;null_equality_mode: strict/sql_semantic&lt;/li&gt;
&lt;li&gt;require_column_name_match: true/false&lt;/li&gt;
&lt;li&gt;require_column_order_match: true/false&lt;/li&gt;
&lt;li&gt;max_rows_for_cell_alignment: integer&lt;/li&gt;
&lt;li&gt;query_timeout_ms: integer (maximum wall-clock time per query execution before abort)&lt;/li&gt;
&lt;li&gt;enable_ai_judge: true/false&lt;/li&gt;
&lt;li&gt;enable_performance_metrics: true/false&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Comparison mode mapping:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;exact: schema, values, and row order must all match.&lt;/li&gt;
&lt;li&gt;order-insensitive: bag (multiset) semantics — duplicates count, row order ignored.&lt;/li&gt;
&lt;li&gt;order-sensitive: full ordered comparison including tuple sequence; required for top-k and pagination queries.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Suggested Output Contract
&lt;/h2&gt;

&lt;p&gt;Top-level fields:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;deterministic_verdict: authoritative pass/fail derived from Layers 1–4.&lt;/li&gt;
&lt;li&gt;ai_advisory_verdict: advisory classification from the AI judge (optional).&lt;/li&gt;
&lt;li&gt;verdict_consistency_flag: True if both verdicts agree; False triggers a consistency warning.&lt;/li&gt;
&lt;li&gt;overall_score: 0.0 when deterministic_verdict is fail due to parse or execution failure; weighted aggregate otherwise.&lt;/li&gt;
&lt;li&gt;structure_score&lt;/li&gt;
&lt;li&gt;result_score&lt;/li&gt;
&lt;li&gt;diagnostic_scores&lt;/li&gt;
&lt;li&gt;performance_scores&lt;/li&gt;
&lt;li&gt;severity&lt;/li&gt;
&lt;li&gt;explanations&lt;/li&gt;
&lt;li&gt;mismatch_map&lt;/li&gt;
&lt;li&gt;error_types: list of Error Taxonomy categories triggered during execution (maps to the DML error taxonomy).&lt;/li&gt;
&lt;li&gt;blocked_reason: present when downstream layers are blocked by an upstream failure; indicates which precedence rule triggered the block (e.g., "parse_failure" or "execution_failure").&lt;/li&gt;
&lt;li&gt;warnings&lt;/li&gt;
&lt;li&gt;validity: parse and execution outcomes&lt;/li&gt;
&lt;li&gt;evidence: clause-level and attribute-level details&lt;/li&gt;
&lt;li&gt;run_metadata: timings, db identifier, version, config hash&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Scoring Strategy
&lt;/h2&gt;

&lt;p&gt;Use weighted aggregation with explicit defaults:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;validity gate: parse or execution failure sets deterministic_verdict to fail and overall_score to 0.0 before any weighted scoring runs.&lt;/li&gt;
&lt;li&gt;result gate: result_equality_family.mode_pass acts as a boolean gate — a false mode_pass sets deterministic_verdict to fail. It does not contribute a numeric weight.&lt;/li&gt;
&lt;li&gt;structure_score: weighted blend of normalized match, clause_match, and derived clause-weighted similarity (computed from clause_weighted_distance at scoring time).&lt;/li&gt;
&lt;li&gt;result_score: weighted blend of schema, overlap, cardinality, tolerance, and NULL handling (mode_pass is the gate, not a blend input).&lt;/li&gt;
&lt;li&gt;diagnostic_scores: not just penalties; include confidence and evidence count.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Derived metric note:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;clause_weighted_similarity is not stored as a primary output; it is derived when needed from clause_weighted_distance.&lt;/li&gt;
&lt;li&gt;execution_time_ratio is derived from execution_time_actual_ms / execution_time_expected_ms and is not independently stored in Validation.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Severity mapping example:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;pass&lt;/li&gt;
&lt;li&gt;minor issue&lt;/li&gt;
&lt;li&gt;moderate issue&lt;/li&gt;
&lt;li&gt;major issue&lt;/li&gt;
&lt;li&gt;critical failure&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Error Taxonomy
&lt;/h2&gt;

&lt;p&gt;DML runtime execution error categories:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;missing_table: Referenced relation does not exist in the database catalog.&lt;/li&gt;
&lt;li&gt;missing_column: Referenced attribute does not exist in the relation schema.&lt;/li&gt;
&lt;li&gt;type_mismatch: Operand data types are incompatible for the operation or predicate.&lt;/li&gt;
&lt;li&gt;division_by_zero: Arithmetic expression produces a division-by-zero runtime error.&lt;/li&gt;
&lt;li&gt;invalid_aggregation: Aggregate function used in an invalid context (e.g., non-grouped attribute in SELECT without GROUP BY).&lt;/li&gt;
&lt;li&gt;ambiguous_reference: Attribute reference is ambiguous across joined relations without qualification.&lt;/li&gt;
&lt;li&gt;permission_error: Executing principal lacks read permission on the referenced relation or schema.&lt;/li&gt;
&lt;li&gt;unknown_error: Uncategorized runtime error; preserve raw DB error message for inspection.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Always store both the mapped taxonomy category and the raw database error message.&lt;/p&gt;

&lt;h2&gt;
  
  
  Deterministic Diagnostics Strategy
&lt;/h2&gt;

&lt;p&gt;Detailed heuristic rules for the Diagnostic Engine (Component F). Layer 4 defines the output schema; this section defines the attribution logic.&lt;/p&gt;

&lt;p&gt;Heuristic clause-attribution signals:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Projection mismatch: output relation schema or SELECT expression differs while tuple cardinality is close.&lt;/li&gt;
&lt;li&gt;Predicate (filter) mismatch: high tuple overlap asymmetry correlated with WHERE/HAVING predicate divergence in the AST.&lt;/li&gt;
&lt;li&gt;Join mismatch: cardinality fanout or compression combined with join graph topology or join predicate divergence.&lt;/li&gt;
&lt;li&gt;Aggregation grain mismatch: aggregate score drift paired with incorrect or missing GROUP BY attributes.&lt;/li&gt;
&lt;li&gt;Ordering-only mismatch: bag-semantics result sets are equal but ordered result sets differ — ordering_error_flag is set.&lt;/li&gt;
&lt;li&gt;Top-k mismatch: result set divergence concentrated near the LIMIT/OFFSET truncation boundary.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Every diagnostic output must include:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;what differed (clause and attribute level)&lt;/li&gt;
&lt;li&gt;likely source clause (projection / selection predicate / join / aggregation grain / ordering / top-k)&lt;/li&gt;
&lt;li&gt;confidence score&lt;/li&gt;
&lt;li&gt;supporting evidence (cardinality delta, shared attribute overlap, predicate diff summary)&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Performance and Explainability
&lt;/h2&gt;

&lt;p&gt;Optional performance metrics should be collected only when stable and comparable:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;execution_time_ratio&lt;/li&gt;
&lt;li&gt;resource_cost_score&lt;/li&gt;
&lt;li&gt;plan_complexity_score&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Include a warning when noisy conditions reduce comparability (cold cache, variable load, missing plan stats).&lt;/p&gt;

&lt;h2&gt;
  
  
  AI Judge Guardrails
&lt;/h2&gt;

&lt;p&gt;Normative rules for AI judge behavior. Verdict structure and output format are defined in Layer 5.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;AI judge is optional and additive.&lt;/li&gt;
&lt;li&gt;AI output is always advisory — deterministic_verdict from Layers 1–4 takes precedence.&lt;/li&gt;
&lt;li&gt;AI cannot flip deterministic pass/fail flags.&lt;/li&gt;
&lt;li&gt;When ai_advisory_verdict disagrees with deterministic_verdict, verdict_consistency_flag is set to False and a consistency warning is included in the report.&lt;/li&gt;
&lt;li&gt;AI responses expose only: short_rationale, evidence_bullets, decision_summary. No internal reasoning trace is returned.&lt;/li&gt;
&lt;li&gt;Include model identifier, prompt version, and confidence score.&lt;/li&gt;
&lt;li&gt;Support policy-based disablement for production pipelines.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Implementation Roadmap
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Phase 1: Core MVP
&lt;/h3&gt;

&lt;p&gt;Build first:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;parse success&lt;/li&gt;
&lt;li&gt;execution success and error type&lt;/li&gt;
&lt;li&gt;normalized SQL match&lt;/li&gt;
&lt;li&gt;clause match&lt;/li&gt;
&lt;li&gt;result_equality_family (mode-driven)&lt;/li&gt;
&lt;li&gt;schema match&lt;/li&gt;
&lt;li&gt;row overlap&lt;/li&gt;
&lt;li&gt;cardinality match&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Phase 2: Debugging Expansion
&lt;/h3&gt;

&lt;p&gt;Add:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;projection/filter/join/grouping/aggregate diagnostic scores&lt;/li&gt;
&lt;li&gt;ordering-only mismatch flag&lt;/li&gt;
&lt;li&gt;per-column mismatch map&lt;/li&gt;
&lt;li&gt;numeric tolerance match&lt;/li&gt;
&lt;li&gt;NULL handling match&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Phase 3: Production and Research Features
&lt;/h3&gt;

&lt;p&gt;Add:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;performance metrics&lt;/li&gt;
&lt;li&gt;AI judge metrics&lt;/li&gt;
&lt;li&gt;richer severity policy profiles&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Metrics Glossary
&lt;/h2&gt;

&lt;p&gt;The Metrics Glossary is the normative reference for metric names, types, and descriptions. Layer sections provide architectural context and processing logic.&lt;/p&gt;

&lt;h3&gt;
  
  
  Validation Metrics
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Layer&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;parse_success_actual&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if actual_sql produces a valid AST with no syntax errors&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;parse_success_expected&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if expected_sql produces a valid AST with no syntax errors&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;normalized_sql_actual&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;string&lt;/td&gt;
&lt;td&gt;Canonicalized SQL string of actual_sql after normalization&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;normalized_sql_expected&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;string&lt;/td&gt;
&lt;td&gt;Canonicalized SQL string of expected_sql after normalization&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ast_actual&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;AST&lt;/td&gt;
&lt;td&gt;Abstract Syntax Tree (AST) of actual_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ast_expected&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;AST&lt;/td&gt;
&lt;td&gt;Abstract Syntax Tree (AST) of expected_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;execution_success_actual&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if actual_sql executes without a DML runtime error&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;execution_success_expected&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if expected_sql executes without a DML runtime error&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;execution_error_actual&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Error taxonomy category + raw DB error message for actual_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;execution_error_expected&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Error taxonomy category + raw DB error message for expected_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;execution_time_actual_ms&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Wall-clock runtime of actual_sql in milliseconds&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;execution_time_expected_ms&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Wall-clock runtime of expected_sql in milliseconds&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;actual_result_table&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;relation&lt;/td&gt;
&lt;td&gt;Materialized result set (tuples) returned by actual_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;expected_result_table&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;relation&lt;/td&gt;
&lt;td&gt;Materialized result set (tuples) returned by expected_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;actual_schema&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Output relation schema of actual_sql - attribute names and data types&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;expected_schema&lt;/td&gt;
&lt;td&gt;Validation&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Output relation schema of expected_sql - attribute names and data types&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Structural Comparison Metrics
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Role&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;normalized_sql_match&lt;/td&gt;
&lt;td&gt;Boolean gate&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if canonicalized SQL strings are identical; if true, deeper structural comparison can be skipped&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;clause_match&lt;/td&gt;
&lt;td&gt;Diagnostic detail&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Per-clause AST comparison: projection (SELECT), selection predicate (WHERE), post-aggregation predicate (HAVING), join graph and join type, aggregation grain (GROUP BY), ordering (ORDER BY), top-k (LIMIT/OFFSET), set operators (UNION/INTERSECT/EXCEPT)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;clause_weighted_distance&lt;/td&gt;
&lt;td&gt;Structural score&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Clause-weighted structural divergence; 0 = identical, higher = more divergent&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Diagnostic Attribution Metrics
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Layer&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;projection_error_score&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Estimated impact from differences in the SELECT projection list - missing, extra, or transformed attributes&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;filter_error_score&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Estimated impact from predicate differences in WHERE (selection) and HAVING (post-aggregation filter)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;join_error_score&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Estimated impact from join graph topology, join predicate (ON condition), or join type (INNER / LEFT / RIGHT / FULL / CROSS / ANTI / SEMI)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;grouping_error_score&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Estimated impact from incorrect aggregation grain in the GROUP BY clause&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;aggregate_function_error_score&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Estimated impact from wrong aggregate function (SUM vs COUNT vs AVG vs MIN vs MAX vs STDDEV)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ordering_error_flag&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True when mismatch is caused solely by tuple ordering; bag-semantics result sets are equal&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;limit_topk_error_score&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Estimated impact from LIMIT, TOP, FETCH FIRST, OFFSET, or ROW_NUMBER top-k differences&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;cardinality_explosion_flag&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True when result set cardinality deviates significantly from expected due to join fanout or over-filtering&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;per_column_mismatch_map&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Explanation layer - attribute-level mismatch rate and severity breakdown explaining row_overlap and cell_overlap&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;confidence&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;0.0-1.0 confidence in the primary diagnostic attribution&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;evidence_strength&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;low / medium / high - based on corroborating signal count and quality&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ambiguous_case&lt;/td&gt;
&lt;td&gt;Diagnostic&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True when multiple clauses are plausible root causes with similar evidence weight&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Result Comparison Metrics
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Role&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;result_equality_family&lt;/td&gt;
&lt;td&gt;Equality family&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Single mode-driven equality family with comparison_mode (exact / order-insensitive / order-sensitive), mode_pass, and optional mode_details for diagnostics&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;schema_match&lt;/td&gt;
&lt;td&gt;Schema&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Relation schema comparison: attribute count, attribute names, data types, and optional attribute ordering&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;row_overlap&lt;/td&gt;
&lt;td&gt;Graded quality&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Tuple-level overlap expressed as Jaccard similarity or F1 score (precision + recall over shared tuples)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;cell_overlap&lt;/td&gt;
&lt;td&gt;Graded quality&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Value-level (attribute x tuple) overlap after alignment; measures localized mismatch granularity&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;cardinality_match&lt;/td&gt;
&lt;td&gt;Cardinality&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Cardinality comparison - absolute row-count delta and relative cardinality ratio between result sets&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;numeric_tolerance_match&lt;/td&gt;
&lt;td&gt;Tolerance&lt;/td&gt;
&lt;td&gt;object&lt;/td&gt;
&lt;td&gt;Floating-point comparison using atol and rtol: match if |actual - expected| &amp;lt;= atol + rtol * |expected| for every aligned numeric cell&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;null_handling_match&lt;/td&gt;
&lt;td&gt;NULL semantics&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if NULL placement and three-valued logic (3VL) behavior is consistent across result sets&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Optional Performance Metrics
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Layer&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;execution_time_ratio&lt;/td&gt;
&lt;td&gt;Performance&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Derived: ratio of execution_time_actual_ms to execution_time_expected_ms; not independently stored in Validation&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;resource_cost_score&lt;/td&gt;
&lt;td&gt;Performance&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Relative cost estimate from the query optimizer execution plan (EXPLAIN) if available&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;plan_complexity_score&lt;/td&gt;
&lt;td&gt;Performance&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;Weighted count of expensive query plan operators: full scans, hash joins, sorts, nested loops&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Final Output Fields
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Layer&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;deterministic_verdict&lt;/td&gt;
&lt;td&gt;Final output&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;Authoritative pass/fail derived from Layers 1-4 deterministic metrics&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ai_advisory_verdict&lt;/td&gt;
&lt;td&gt;Final output&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;Advisory classification from the AI judge; never overrides deterministic_verdict&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;verdict_consistency_flag&lt;/td&gt;
&lt;td&gt;Final output&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True if both verdicts agree; False triggers a consistency warning&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;blocked_reason&lt;/td&gt;
&lt;td&gt;Final output&lt;/td&gt;
&lt;td&gt;string&lt;/td&gt;
&lt;td&gt;Present when downstream layers are blocked by an upstream failure; indicates the triggering precedence rule&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  AI Judge Metrics
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Layer&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;intent_adequacy_judgment&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;LLM-as-judge assessment of whether actual_sql satisfies user_query intent as well as expected_sql&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;database_scoped_equivalence_judgment&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;LLM assessment of observed equivalence on this database instance - does not claim universal semantic equivalence&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;acceptable_deviation_judgment&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;Grounded classification: harmless (ordering-only), acceptable (minor rounding), or critical (wrong data)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;short_rationale&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;string&lt;/td&gt;
&lt;td&gt;One-to-two sentence summary of the AI judge's reasoning&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;evidence_bullets&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;list&lt;/td&gt;
&lt;td&gt;Ordered list of key deterministic signals used in the AI judgment&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;decision_summary&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;string&lt;/td&gt;
&lt;td&gt;Final label and justification without internal reasoning traces&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;severity_classification&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;Severity triage label: pass / minor issue / moderate issue / major issue / critical failure&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;confidence&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;float&lt;/td&gt;
&lt;td&gt;0.0-1.0 confidence in the AI judgment&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;evidence_strength&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;label&lt;/td&gt;
&lt;td&gt;low / medium / high - based on quality of deterministic evidence available&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ambiguous_case&lt;/td&gt;
&lt;td&gt;AI Judge&lt;/td&gt;
&lt;td&gt;bool&lt;/td&gt;
&lt;td&gt;True when the AI judge cannot distinguish between two plausible classifications&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

</description>
      <category>sql</category>
      <category>ai</category>
      <category>evaluation</category>
      <category>llm</category>
    </item>
    <item>
      <title>Build a Production‑Ready SQL Evaluation Engine for LLMs</title>
      <dc:creator>kasi viswanath vandanapu</dc:creator>
      <pubDate>Mon, 30 Mar 2026 17:35:23 +0000</pubDate>
      <link>https://forem.com/kasi_viswanath/build-a-production-ready-sql-evaluation-engine-for-llms-3e9k</link>
      <guid>https://forem.com/kasi_viswanath/build-a-production-ready-sql-evaluation-engine-for-llms-3e9k</guid>
      <description>&lt;h2&gt;
  
  
  Intro
&lt;/h2&gt;

&lt;p&gt;When I first started building a text‑to‑SQL system, the obvious thing was to run the generated query against a database and compare the result with a ground truth. That worked for a handful of examples, but as soon as we hit hundreds of user queries, the naive approach broke down: it was slow, brittle, and offered no insight into &lt;em&gt;why&lt;/em&gt; a query failed.&lt;/p&gt;

&lt;p&gt;What I needed was a two‑layer engine:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Fast deterministic checks&lt;/strong&gt; that catch the most common mistakes in under a second.&lt;/li&gt;
&lt;li&gt;An &lt;strong&gt;AI judge&lt;/strong&gt; that digs deeper when those checks fail, tells you exactly what’s missing or wrong, and even spits out a corrected SQL snippet.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;Below is my complete, production‑ready framework (no storage, no UI). I’ll walk through the architecture, show you the core code, and explain how to plug it into your own pipeline. By the end, you’ll have a reusable tool that turns every LLM‑generated query into actionable feedback—perfect for continuous model improvement.&lt;/p&gt;




&lt;h2&gt;
  
  
  1. Why Two Layers?
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Layer&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;th&gt;Typical Cost&lt;/th&gt;
&lt;th&gt;Speed&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Deterministic&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Quick sanity checks (row count, column coverage, AST structure)&lt;/td&gt;
&lt;td&gt;Negligible&lt;/td&gt;
&lt;td&gt;&amp;lt; 0.5 s per query&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;AI Judge&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Deep semantic review + actionable suggestions&lt;/td&gt;
&lt;td&gt;Medium (model token cost)&lt;/td&gt;
&lt;td&gt;1–3 s per query&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;The deterministic layer filters out the &lt;em&gt;obvious&lt;/em&gt; failures—missing joins, wrong aggregates, mismatched row counts—so we only pay for the expensive AI pass when something truly needs human‑level reasoning. This keeps overall costs low while still giving you rich diagnostics.&lt;/p&gt;




&lt;h2&gt;
  
  
  2. High‑Level Flow
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;User Query + Expected SQL
          │
          ▼
[Deterministic Evaluator] → scores, verdict
          │
          ├─ If score ≥ 0.92 → return quick summary
          │
          ▼
[AI Judge] (LLM) → detailed JSON: missing elements, root cause, suggested fix
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The evaluator returns a dictionary with metrics like &lt;code&gt;row_count_match&lt;/code&gt;, &lt;code&gt;column_coverage&lt;/code&gt;, and an overall weighted score. If the score is high enough we skip the AI step; otherwise we call the LLM with a carefully crafted prompt that forces it to output structured JSON.&lt;/p&gt;




&lt;h2&gt;
  
  
  3. Core Code
&lt;/h2&gt;

&lt;p&gt;Save this as &lt;strong&gt;&lt;code&gt;sql_eval_framework.py&lt;/code&gt;&lt;/strong&gt;. It contains:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;deterministic_evaluate()&lt;/code&gt; – the fast checks.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;build_judge_prompt()&lt;/code&gt; – template for the AI step.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;evaluate_batch()&lt;/code&gt; – async batch runner with concurrency control.&lt;/li&gt;
&lt;/ul&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
python
# sql_eval_framework.py
import asyncio, json
from typing import Dict, Any, List
import pandas as pd
from sqlglot import parse_one, exp
from litellm import acompletion  # pip install litellm
from pydantic import BaseModel, Field

# ------------------------------------------------------------------
# 1️⃣ Deterministic Evaluator (80/20 + extras)
# ------------------------------------------------------------------
def deterministic_evaluate(
    expected_sql: str,
    llm_sql: str,
    expected_df: pd.DataFrame,
    llm_df: pd.DataFrame
) -&amp;gt; Dict[str, Any]:
    """
    Fast checks that run in &amp;lt; 0.5 s per query.
    Returns a dict with scores and a weighted overall score.
    """

    # --- Row count -------------------------------------------------
    row_match = len(expected_df) == len(llm_df)
    row_delta_pct = abs(len(expected_df) - len(llm_df)) / max(len(expected_df), 1)

    # --- Column coverage --------------------------------------------
    exp_cols, llm_cols = set(expected_df.columns), set(llm_df.columns)
    col_coverage = len(exp_cols &amp;amp; llm_cols) / len(exp_cols) if exp_cols else 0
    missing_cols = list(exp_cols - llm_cols)
    extra_cols   = list(llm_cols - exp_cols)

    # --- Result set match (order‑insensitive) -----------------------
    try:
        exp_sorted = expected_df.sort_values(by=list(expected_df.columns)).reset_index(drop=True)
        llm_sorted  = llm_df.sort_values(by=list(llm_df.columns)).reset_index(drop=True)
        result_exact = exp_sorted.equals(llm_sorted)
    except Exception:
        result_exact = False

    # --- Basic AST structural checks --------------------------------
    try:
        ast_exp = parse_one(expected_sql, read="postgres")
        ast_llm = parse_one(llm_sql, read="postgres")

        tables_match   = {t.this for t in ast_exp.find_all(exp.Table)} == \
                         {t.this for t in ast_llm.find_all(exp.Table)}
        where_exists   = bool(ast_exp.args.get("where")) == bool(ast_llm.args.get("where"))
        group_by_match = str(ast_exp.args.get("group")) == str(ast_llm.args.get("group"))
    except Exception:
        tables_match, where_exists, group_by_match = False, False, False

    # --- Weighted score ---------------------------------------------
    weights = {
        "row": 0.15,
        "col": 0.20,
        "result": 0.35,   # weights must sum to 1.0 so the 0.92 skip threshold is reachable
        "tables": 0.10,
        "where": 0.10,
        "group_by": 0.10
    }
    overall_score = (
        (1 if row_match else 0) * weights["row"] +
        col_coverage * weights["col"] +
        (1 if result_exact else 0) * weights["result"] +
        (1 if tables_match else 0) * weights["tables"] +
        (1 if where_exists else 0) * weights["where"] +
        (1 if group_by_match else 0) * weights["group_by"]
    )

    return {
        "row_count_match": row_match,
        "row_delta_pct": round(row_delta_pct, 3),
        "column_coverage": round(col_coverage, 2),
        "missing_columns": missing_cols,
        "extra_columns": extra_cols,
        "result_exact_match": result_exact,
        "tables_match": tables_match,
        "where_exists_match": where_exists,
        "group_by_match": group_by_match,
        "deterministic_score": round(overall_score, 2),
    }

# ------------------------------------------------------------------
# 2️⃣ AI Judge Prompt Builder
# ------------------------------------------------------------------
def build_judge_prompt(
    user_query: str,
    schema_ddl: str,
    expected_sql: str,
    llm_sql: str,
    exp_sample_csv: str,
    llm_sample_csv: str,
    exp_rows: int,
    llm_rows: int,
    exp_cols: List[str],
    llm_cols: List[str]
) -&amp;gt; str:
    """
    Returns a prompt that forces the LLM to output *only* JSON.
    The prompt includes enough context (schema, samples, row counts)
    so the model can reason about missing or extra elements.
    """

    return f"""
You are an expert SQL analyst and debugger with 15+ years experience.

Task: Deeply compare the **Expected SQL** (ground truth) with the **LLM-generated SQL**.
User natural‑language query: {user_query}

Database Schema (full DDL):
{schema_ddl}

Expected SQL (ground truth):
{expected_sql}

LLM-generated SQL:
{llm_sql}

Expected Result (first 10 rows as CSV):
{exp_sample_csv}

LLM Result (first 10 rows as CSV):
{llm_sample_csv}

Row count — Expected: {exp_rows} | LLM: {llm_rows}
Columns — Expected: {', '.join(exp_cols)} | LLM: {', '.join(llm_cols)}

Evaluate using this exact rubric. Think step‑by‑step.

1. **Semantic Correctness** (0-100): How well does the LLM SQL answer the user query compared to the expected SQL?
2. **What is MISSING?** List every important element present in Expected but absent/mis‑implemented in LLM SQL (filters, joins, aggregations, computed columns, groupings, ordering, etc.).
3. **What is EXTRA or WRONG?** List anything in LLM SQL that should not be there or is incorrect.
4. **Root Cause Hypothesis**: Why might the LLM have made these mistakes given the original prompt?
5. **Suggested Fix**: Provide a complete, corrected SQL that would pass all checks. Make minimal changes possible while fully matching the expected semantics.

Return ONLY valid JSON (no markdown):
{{
  "semantic_score": integer 0-100,
  "missing_elements": ["filter: status='ACTIVE'", "join on customer_id", ...],
  "extra_or_wrong_elements": ["unnecessary ORDER BY", ...],
  "root_cause": "string explanation",
  "suggested_improved_sql": "full corrected SQL here",
  "overall_verdict": "PASS | PARTIAL | FAIL",
  "explanation": "concise paragraph"
}}
"""

# ------------------------------------------------------------------
# 3️⃣ Batch Evaluator (async, concurrency control)
# ------------------------------------------------------------------
async def evaluate_batch(
    test_cases: List[Dict[str, Any]],
    max_parallel: int = 10,
    judge_model: str = "claude-3-5-haiku-20241022"
) -&amp;gt; List[Dict[str, Any]]:
    """
    test_cases is a list of dicts with keys:
        user_query, schema_ddl, expected_sql, llm_sql,
        expected_df, llm_df
    Returns each case enriched with deterministic scores and, if needed,
    the AI judge JSON.
    """

    semaphore = asyncio.Semaphore(max_parallel)

    async def process_one(case: Dict[str, Any]) -&amp;gt; Dict[str, Any]:
        # 1️⃣ Deterministic pass
        det_scores = deterministic_evaluate(
            case["expected_sql"],
            case["llm_sql"],
            case["expected_df"],
            case["llm_df"]
        )
        result = {**case, **det_scores}

        # 2️⃣ If high enough, skip AI judge
        if det_scores["deterministic_score"] &amp;gt;= 0.92:
            result["judge_review"] = None
            return result

        # 3️⃣ Build prompt and call LLM
        exp_sample_csv = case["expected_df"].head(10).to_csv(index=False)
        llm_sample_csv = case["llm_df"].head(10).to_csv(index=False)

        prompt = build_judge_prompt(
            user_query=case["user_query"],
            schema_ddl=case["schema_ddl"],
            expected_sql=case["expected_sql"],
            llm_sql=case["llm_sql"],
            exp_sample_csv=exp_sample_csv,
            llm_sample_csv=llm_sample_csv,
            exp_rows=len(case["expected_df"]),
            llm_rows=len(case["llm_df"]),
            exp_cols=list(case["expected_df"].columns),
            llm_cols=list(case["llm_df"].columns)
        )

        response = await acompletion(
            model=judge_model,
            messages=[{"role": "user", "content": prompt}]
        )
        # Parse the JSON safely
        try:
            review = json.loads(response.choices[0].message.content)
        except Exception as e:
            review = {"error": str(e), "raw_response": response.choices[0].message.content}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

</description>
      <category>sql</category>
      <category>llm</category>
      <category>evaluation</category>
      <category>python</category>
    </item>
    <item>
      <title>Streaming AI Agent with FastAPI &amp; LangGraph (2025‑26 Guide)</title>
      <dc:creator>kasi viswanath vandanapu</dc:creator>
      <pubDate>Mon, 16 Feb 2026 21:05:38 +0000</pubDate>
      <link>https://forem.com/kasi_viswanath/streaming-ai-agent-with-fastapi-langgraph-2025-26-guide-1nkn</link>
      <guid>https://forem.com/kasi_viswanath/streaming-ai-agent-with-fastapi-langgraph-2025-26-guide-1nkn</guid>
      <description>&lt;h1&gt;
  
  
  Streaming AI Agent with FastAPI &amp;amp; LangGraph (2025‑26 Guide)
&lt;/h1&gt;

&lt;p&gt;Real‑world AI workflows—data pipelines, planning bots, ETL orchestration—often leave users staring at a loading spinner for 10–60 seconds. In 2025‑26 the expectation is instant feedback: step transitions, progress bars, intermediate results, and especially human‑in‑the‑loop (HITL) pauses.&lt;br&gt;&lt;br&gt;
In this post I’ll walk you through a production‑ready stack that:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Accepts a user message &lt;strong&gt;plus an optional file upload&lt;/strong&gt; via POST&lt;/li&gt;
&lt;li&gt;Runs a LangGraph agent with planning → data‑model resolution → HITL review&lt;/li&gt;
&lt;li&gt;Streams every meaningful step and progress update to the frontend using Server‑Sent Events (SSE)&lt;/li&gt;
&lt;li&gt;Uses &lt;code&gt;get_stream_writer()&lt;/code&gt; so any node can emit clean, structured progress messages without manual callbacks&lt;/li&gt;
&lt;li&gt;Persists sessions via &lt;code&gt;thread_id / session_id&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Comes with a minimal React front‑end that consumes the stream&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Let’s dive in.&lt;/p&gt;
&lt;h2&gt;
  
  
  Tech Stack
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Component&lt;/th&gt;
&lt;th&gt;Version / Notes&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Python&lt;/td&gt;
&lt;td&gt;3.11+ (required for reliable async &lt;code&gt;get_stream_writer()&lt;/code&gt; propagation)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;FastAPI&lt;/td&gt;
&lt;td&gt;0.115+ (built‑in streaming &amp;amp; file upload support)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;LangGraph&lt;/td&gt;
&lt;td&gt;0.2.70+ (latest as of Feb 2026)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;LangChain ecosystem&lt;/td&gt;
&lt;td&gt;0.1.x+&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;python-multipart&lt;/td&gt;
&lt;td&gt;
&lt;code&gt;pip install python-multipart&lt;/code&gt; (for &lt;code&gt;UploadFile&lt;/code&gt;)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;React&lt;/td&gt;
&lt;td&gt;18+ (TypeScript optional, shown with plain JS for brevity)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Uvicorn&lt;/td&gt;
&lt;td&gt;For running the FastAPI app locally&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Gotcha:&lt;/strong&gt; Use Python ≥ 3.11 to avoid &lt;code&gt;ContextVar&lt;/code&gt; issues when propagating the stream writer across async tasks.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h2&gt;
  
  
  Project Structure
&lt;/h2&gt;


&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;streaming-ai-agent/
├── app/
│   ├── __init__.py
│   ├── main.py               # FastAPI app entry point
│   ├── api/
│   │   └── router.py         # /api/stream endpoint
│   ├── graph/
│   │   ├── __init__.py
│   │   ├── state.py          # TypedDict for AgentState
│   │   ├── nodes/
│   │   │   ├── __init__.py
│   │   │   ├── plan_data_model.py
│   │   │   └── hitl_review.py
│   │   └── service.py        # Graph builder + streaming logic
│   └── dependencies.py       # (optional) shared checkpointer
├── frontend/                 # React app (Vite / CRA)
│   └── src/
│       └── App.jsx           # Streaming consumer example
├── requirements.txt
└── README.md
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;h2&gt;
  
  
  Step 1: Define the State
&lt;/h2&gt;

&lt;p&gt;We’ll use a &lt;code&gt;TypedDict&lt;/code&gt; with &lt;code&gt;Annotated&lt;/code&gt; to let LangGraph know how to merge message lists.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# app/graph/state.py
&lt;/span&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;typing&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;Annotated&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;TypedDict&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;List&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langgraph.graph.message&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;add_messages&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langchain_core.messages&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;BaseMessage&lt;/span&gt;

&lt;span class="k"&gt;class&lt;/span&gt; &lt;span class="nc"&gt;AgentState&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;TypedDict&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
    &lt;span class="n"&gt;messages&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;Annotated&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;List&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;BaseMessage&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt; &lt;span class="n"&gt;add_messages&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
    &lt;span class="n"&gt;user_message&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;
    &lt;span class="n"&gt;file_content&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
    &lt;span class="n"&gt;data_model_plan&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;dict&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
    &lt;span class="n"&gt;resolved_model&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;dict&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
    &lt;span class="n"&gt;human_feedback&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
    &lt;span class="n"&gt;status&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Step 2: Create Nodes with Progress Emission
&lt;/h2&gt;

&lt;p&gt;Each node pulls the stream writer, emits a structured event (JSON‑serialisable &lt;code&gt;dict&lt;/code&gt;), and updates state. No real LLM calls are needed for this demo.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# app/graph/nodes/plan_data_model.py
&lt;/span&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langgraph.config&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;get_stream_writer&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;..state&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;

&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;plan_data_model_node&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;state&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;writer&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;get_stream_writer&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

    &lt;span class="nf"&gt;writer&lt;/span&gt;&lt;span class="p"&gt;({&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;type&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;step_start&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;step&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;plan_data_model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;message&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Analyzing user request and file (if any)...&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
    &lt;span class="p"&gt;})&lt;/span&gt;

    &lt;span class="c1"&gt;# Simulated planning logic
&lt;/span&gt;    &lt;span class="n"&gt;plan&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;entities&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;user&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;product&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;relationships&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;user → order&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order → product&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
    &lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;state&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;file_content&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
        &lt;span class="n"&gt;plan&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;note&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;CSV detected → adding transaction fields&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;

    &lt;span class="nf"&gt;writer&lt;/span&gt;&lt;span class="p"&gt;({&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;type&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;step_update&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;step&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;plan_data_model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;progress&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="mf"&gt;0.6&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;message&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Generated initial data model plan&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
    &lt;span class="p"&gt;})&lt;/span&gt;

    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
        &lt;span class="o"&gt;**&lt;/span&gt;&lt;span class="n"&gt;state&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;data_model_plan&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;plan&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;status&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;planning_complete&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
    &lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# app/graph/nodes/hitl_review.py
&lt;/span&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langgraph.config&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;get_stream_writer&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langgraph.types&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;interrupt&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;..state&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;

&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;hitl_review_node&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;state&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;writer&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;get_stream_writer&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

    &lt;span class="nf"&gt;writer&lt;/span&gt;&lt;span class="p"&gt;({&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;type&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;waiting_human&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;step&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;human_review&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;message&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Please review the proposed data model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;plan&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;state&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;data_model_plan&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
    &lt;span class="p"&gt;})&lt;/span&gt;

    &lt;span class="c1"&gt;# Raise an interrupt so the agent pauses until we resume with feedback
&lt;/span&gt;    &lt;span class="k"&gt;raise&lt;/span&gt; &lt;span class="nf"&gt;interrupt&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;needs_review&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="c1"&gt;# Code after this line runs only when resumed
&lt;/span&gt;    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;state&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Step 3: Build &amp;amp; Stream the Graph (&lt;code&gt;service.py&lt;/code&gt;)
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# app/graph/service.py
&lt;/span&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langgraph.graph&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;StateGraph&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;START&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;END&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;langgraph.checkpoint.memory&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;MemorySaver&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;.state&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;AgentState&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;.nodes.plan_data_model&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;plan_data_model_node&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;.nodes.hitl_review&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;hitl_review_node&lt;/span&gt;

&lt;span class="n"&gt;checkpointer&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;MemorySaver&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

&lt;span class="n"&gt;graph&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;StateGraph&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;state_schema&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;AgentState&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Register nodes
&lt;/span&gt;&lt;span class="n"&gt;graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add_node&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;plan_data_model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;plan_data_model_node&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add_node&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;human_review&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;hitl_review_node&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Define edges
&lt;/span&gt;&lt;span class="n"&gt;graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add_edge&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;START&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;plan_data_model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add_edge&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;plan_data_model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;human_review&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add_edge&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;human_review&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;END&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;  &lt;span class="c1"&gt;# or loop back if resumed
&lt;/span&gt;
&lt;span class="n"&gt;compiled_graph&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;compile&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;checkpointer&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;checkpointer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="k"&gt;async&lt;/span&gt; &lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;stream_agent_process&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;inputs&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;dict&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;thread_id&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;file_content&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
&lt;span class="p"&gt;):&lt;/span&gt;
    &lt;span class="n"&gt;config&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;configurable&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;thread_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;thread_id&lt;/span&gt;&lt;span class="p"&gt;}}&lt;/span&gt;

    &lt;span class="n"&gt;initial_state&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;messages&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[],&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;user_message&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;inputs&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;user_message&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;""&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;file_content&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;file_content&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;status&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;starting&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
    &lt;span class="p"&gt;}&lt;/span&gt;

    &lt;span class="k"&gt;async&lt;/span&gt; &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;event&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;compiled_graph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;astream&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="n"&gt;initial_state&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="n"&gt;config&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;config&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="n"&gt;stream_mode&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;custom&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;updates&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;messages&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;
        &lt;span class="n"&gt;subgraphs&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;
    &lt;span class="p"&gt;):&lt;/span&gt;
        &lt;span class="c1"&gt;# When `subgraphs=True`, events are tuples (namespace, payload)
&lt;/span&gt;        &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="nf"&gt;isinstance&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;event&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nb"&gt;tuple&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
            &lt;span class="n"&gt;mode&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;payload&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;event&lt;/span&gt;
        &lt;span class="k"&gt;else&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
            &lt;span class="n"&gt;mode&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;payload&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;unknown&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;event&lt;/span&gt;

        &lt;span class="k"&gt;yield&lt;/span&gt; &lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;data: &lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;dumps&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;mode&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;:&lt;/span&gt; &lt;span class="n"&gt;mode&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;payload&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;:&lt;/span&gt; &lt;span class="n"&gt;payload&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="se"&gt;\n\n&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Important gotchas&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;subgraphs=True&lt;/code&gt; → events become &lt;code&gt;(namespace, data)&lt;/code&gt; tuples.&lt;/li&gt;
&lt;li&gt;Combine modes (&lt;code&gt;["custom","updates","messages"]&lt;/code&gt;) to get progress events &lt;em&gt;and&lt;/em&gt; state updates.&lt;/li&gt;
&lt;li&gt;Use &lt;code&gt;json.dumps()&lt;/code&gt; so the SSE payload is safe and parseable on the client.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Step 4: FastAPI Streaming Endpoint with File Upload
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# app/api/router.py
&lt;/span&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;fastapi&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;APIRouter&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;UploadFile&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;Form&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;HTTPException&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;fastapi.responses&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;StreamingResponse&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;..graph.service&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;stream_agent_process&lt;/span&gt;

&lt;span class="n"&gt;router&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;APIRouter&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;prefix&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;/api&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="nd"&gt;@router.post&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;/stream&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="k"&gt;async&lt;/span&gt; &lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;stream_agent&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;user_message&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;Form&lt;/span&gt;&lt;span class="p"&gt;(...),&lt;/span&gt;
    &lt;span class="n"&gt;session_id&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;Form&lt;/span&gt;&lt;span class="p"&gt;(...),&lt;/span&gt;
    &lt;span class="nb"&gt;file&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;UploadFile&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
&lt;span class="p"&gt;):&lt;/span&gt;
    &lt;span class="n"&gt;file_content&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
    &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="nb"&gt;file&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="n"&gt;content&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="k"&gt;await&lt;/span&gt; &lt;span class="nb"&gt;file&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;read&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
        &lt;span class="n"&gt;file_content&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;content&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;decode&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;utf-8&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;errors&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;ignore&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="k"&gt;async&lt;/span&gt; &lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;event_generator&lt;/span&gt;&lt;span class="p"&gt;():&lt;/span&gt;
        &lt;span class="k"&gt;try&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
            &lt;span class="k"&gt;async&lt;/span&gt; &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;chunk&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="nf"&gt;stream_agent_process&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
                &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;user_message&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;user_message&lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;
                &lt;span class="n"&gt;thread_id&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;session_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
                &lt;span class="n"&gt;file_content&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;file_content&lt;/span&gt;
            &lt;span class="p"&gt;):&lt;/span&gt;
                &lt;span class="k"&gt;yield&lt;/span&gt; &lt;span class="n"&gt;chunk&lt;/span&gt;
        &lt;span class="k"&gt;except&lt;/span&gt; &lt;span class="nb"&gt;Exception&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;e&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
            &lt;span class="k"&gt;yield&lt;/span&gt; &lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;data: &lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;dumps&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;error&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;:&lt;/span&gt; &lt;span class="nf"&gt;str&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;e&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="se"&gt;\n\n&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;

    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="nc"&gt;StreamingResponse&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;event_generator&lt;/span&gt;&lt;span class="p"&gt;(),&lt;/span&gt;
        &lt;span class="n"&gt;media_type&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;text/event-stream&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="n"&gt;headers&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Cache-Control&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;no-cache&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Connection&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;keep-alive&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;X-Accel-Buffering&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;no&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;  &lt;span class="c1"&gt;# Important for nginx proxy
&lt;/span&gt;        &lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Add &lt;code&gt;router&lt;/code&gt; to your FastAPI app in &lt;code&gt;main.py&lt;/code&gt;.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Do not set&lt;/strong&gt; a &lt;code&gt;Content-Type&lt;/code&gt; header manually on the frontend fetch: the browser must generate the &lt;code&gt;multipart/form-data&lt;/code&gt; boundary for the &lt;code&gt;FormData&lt;/code&gt; body. The &lt;em&gt;response&lt;/em&gt; is what carries &lt;code&gt;text/event-stream&lt;/code&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  Step 5: React Frontend – Consume the Stream
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
// frontend/src/App.jsx
import { useState } from "react";

function App() {
  const [logs, setLogs] = useState([]);
  const [status, setStatus] = useState("Ready");
  const sessionId = crypto.randomUUID();

  const handleSubmit = async (e) =&amp;gt; {
    e.preventDefault();
    const form = e.target;
    const message = form.message.value;
    const fileInput = form.file.files?.[0];

    const formData = new FormData();
    formData.append("user_message", message);
    formData.append("session_id", sessionId);
    if (fileInput) formData.append("file", fileInput);

    const response = await fetch("/api/stream", {
      method: "POST",
      body: formData
    });

    if (!response.body) return;

    const reader = response.body.getReader();
    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      const chunk = decoder.decode(value);
      const lines = chunk.split("\n");

      for (const line of lines) {
        if (!line.startsWith("data: ")) continue;
        try {
          const data = JSON.parse(line.slice(6));
          const { mode, payload } = data;

          if (mode === "custom") {
            switch (payload.type) {
              case "step_start":
                setLogs((l) =&amp;gt; [...l, `→ Starting: ${payload.step}`]);
                break;
              case "step_update":
                setLogs((l) =&amp;gt; [
                  ...l,
                  `  Update: ${payload.message} (${Math.round(payload.progress * 100)}%)`
                ]);
                break;
              case "waiting_human":
                setStatus("Waiting for your review…");
                setLogs((l) =&amp;gt; [...l
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

</description>
      <category>fastapi</category>
      <category>langgraph</category>
      <category>streaming</category>
      <category>react</category>
    </item>
  </channel>
</rss>
