<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>Forem: Mrunmayee Rane</title>
    <description>The latest articles on Forem by Mrunmayee Rane (@mrunmayee_rane_9d0e22b4de).</description>
    <link>https://forem.com/mrunmayee_rane_9d0e22b4de</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3898839%2F6d6a1ffe-aee1-4ea9-8656-6712cf4e2115.jpg</url>
      <title>Forem: Mrunmayee Rane</title>
      <link>https://forem.com/mrunmayee_rane_9d0e22b4de</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://forem.com/feed/mrunmayee_rane_9d0e22b4de"/>
    <language>en</language>
    <item>
      <title>This post is for engineers building an agentic harness — especially if you are thinking about tools, memory, evals, observability, and production reliability.</title>
      <dc:creator>Mrunmayee Rane</dc:creator>
      <pubDate>Tue, 26 May 2026 08:01:36 +0000</pubDate>
      <link>https://forem.com/mrunmayee_rane_9d0e22b4de/this-post-is-for-engineers-building-agentic-harness-especially-if-you-are-thinking-about-tools-57pp</link>
      <guid>https://forem.com/mrunmayee_rane_9d0e22b4de/this-post-is-for-engineers-building-agentic-harness-especially-if-you-are-thinking-about-tools-57pp</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/mrunmayee_rane_9d0e22b4de/the-agentic-harness-how-to-build-ai-agents-in-production-1id3" class="crayons-story__hidden-navigation-link"&gt;The Agentic Harness: How to Build AI Agents in Production&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/mrunmayee_rane_9d0e22b4de" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3898839%2F6d6a1ffe-aee1-4ea9-8656-6712cf4e2115.jpg" alt="mrunmayee_rane_9d0e22b4de profile" class="crayons-avatar__image"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/mrunmayee_rane_9d0e22b4de" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Mrunmayee Rane
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Mrunmayee Rane
                
              
              &lt;div id="story-author-preview-content-3755068" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/mrunmayee_rane_9d0e22b4de" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3898839%2F6d6a1ffe-aee1-4ea9-8656-6712cf4e2115.jpg" class="crayons-avatar__image" alt=""&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Mrunmayee Rane&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/mrunmayee_rane_9d0e22b4de/the-agentic-harness-how-to-build-ai-agents-in-production-1id3" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;May 25&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/mrunmayee_rane_9d0e22b4de/the-agentic-harness-how-to-build-ai-agents-in-production-1id3" id="article-link-3755068"&gt;
          The Agentic Harness: How to Build AI Agents in Production
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/agents"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;agents&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/llm"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;llm&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/agenticharness"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;agenticharness&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/mrunmayee_rane_9d0e22b4de/the-agentic-harness-how-to-build-ai-agents-in-production-1id3#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              Comments


              &lt;span class="hidden s:inline"&gt;Add Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            12 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
    </item>
    <item>
      <title>The Agentic Harness: How to Build AI Agents in Production</title>
      <dc:creator>Mrunmayee Rane</dc:creator>
      <pubDate>Mon, 25 May 2026 07:00:00 +0000</pubDate>
      <link>https://forem.com/mrunmayee_rane_9d0e22b4de/the-agentic-harness-how-to-build-ai-agents-in-production-1id3</link>
      <guid>https://forem.com/mrunmayee_rane_9d0e22b4de/the-agentic-harness-how-to-build-ai-agents-in-production-1id3</guid>
      <description>

&lt;p&gt;Most people are still building AI agents like demos.&lt;/p&gt;

&lt;p&gt;They connect an LLM to a few tools, add a system prompt, wrap everything in a chat UI, and call it an agent.&lt;/p&gt;

&lt;p&gt;That is not an agent system.&lt;/p&gt;

&lt;p&gt;That is a model with tool access.&lt;/p&gt;

&lt;p&gt;A real AI agent is not just a prompt, a model, or a framework. A real AI agent is an engineered runtime.&lt;/p&gt;

&lt;p&gt;It needs a harness.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fvvavh5wj19ynojp4gyg9.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fvvavh5wj19ynojp4gyg9.png" alt="Best Practices for Agentic Harness in Production" width="800" height="450"&gt;&lt;/a&gt;&lt;br&gt;
The &lt;strong&gt;agentic harness&lt;/strong&gt; is the system around the model that makes agent behavior useful, repeatable, observable, and safe.&lt;/p&gt;

&lt;p&gt;It decides:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;How the model receives context&lt;/li&gt;
&lt;li&gt;How it uses tools&lt;/li&gt;
&lt;li&gt;How progress is persisted&lt;/li&gt;
&lt;li&gt;How failures are handled&lt;/li&gt;
&lt;li&gt;How work is evaluated&lt;/li&gt;
&lt;li&gt;How the system improves over time&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;The mindset shift is simple:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;The model is not the product.&lt;br&gt;&lt;br&gt;
The harness around the model is the product.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;A stronger model can improve reasoning.&lt;/p&gt;

&lt;p&gt;But the harness determines whether that reasoning turns into reliable action.&lt;/p&gt;


&lt;h2&gt;
  
  
  &lt;strong&gt;What Is an Agentic Harness?&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;An agentic harness is the runtime layer that enables a model to behave like an agent.&lt;/p&gt;

&lt;p&gt;It receives a task, loads the right instructions and context, exposes the right tools, manages the execution loop, captures state, verifies progress, handles errors, records traces, and returns the final result.&lt;/p&gt;

&lt;p&gt;A simple version looks like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Receive task
2. Load identity
3. Load task instructions
4. Load relevant context
5. Retrieve memory
6. Select tools
7. Plan next action
8. Execute tool call
9. Observe result
10. Update state
11. Verify outcome
12. Write durable progress
13. Return response
14. Record trace
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The important part is not that every agent uses this exact loop.&lt;/p&gt;

&lt;p&gt;The important part is that the loop exists outside the model.&lt;/p&gt;

&lt;p&gt;A weak agent relies on the model to figure everything out inside one giant context window.&lt;/p&gt;

&lt;p&gt;A strong agent externalizes responsibilities into the harness:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Identity lives in a stable instruction layer
2. Memory lives outside the prompt
3. Skills live as reusable procedures
4. Tools expose controlled actions
5. Policies constrain execution
6. Progress files preserve continuity
7. Traces capture behavior
8. Evals measure outcomes and trajectories
9. Governance defines ownership
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The model should reason.&lt;/p&gt;

&lt;p&gt;The harness should govern.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Do not put everything inside the prompt. Build the system around the prompt.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Start Simple, Then Add Agency Only Where It Pays Off&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;One of the biggest mistakes in agent development is adding autonomy too early.&lt;/p&gt;

&lt;p&gt;Not every AI system needs to be an agent.&lt;/p&gt;

&lt;p&gt;Some tasks are better served by:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. A single model call
2. Retrieval-augmented generation
3. A deterministic workflow
4. A simple rules engine
5. A human review flow
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Some tasks genuinely need an agent that can decide what to do next, use tools, and adapt across multiple turns.&lt;/p&gt;

&lt;p&gt;A useful distinction:&lt;/p&gt;

&lt;p&gt;Workflow: the system follows predefined code paths.&lt;br&gt;
Agent: the model dynamically decides its process and tool usage.&lt;/p&gt;

&lt;p&gt;A good harness lets you mix both.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;User request
   ↓
Intent router
   ↓
Simple task?   → deterministic workflow
Complex task?  → agent loop
High-risk task? → human review gate
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This gives you a practical architecture:&lt;/p&gt;

&lt;p&gt;Keep deterministic paths deterministic.&lt;/p&gt;

&lt;p&gt;Reserve agentic behavior for places where model-driven decision-making actually creates value.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Use the simplest system that can solve the task. Add agency only when flexibility is worth the cost.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Define the Agent’s Operating Identity&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Before memory, tools, skills, and evals, the agent needs identity.&lt;/p&gt;

&lt;p&gt;Identity is not personality decoration.&lt;/p&gt;

&lt;p&gt;It is behavioral control.&lt;/p&gt;

&lt;p&gt;A weak identity says:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;You are a helpful AI assistant.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That does almost nothing.&lt;/p&gt;

&lt;p&gt;A stronger identity says:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;You are a pragmatic staff engineer operating in production systems. You optimize for correctness, reliability, maintainability, and small safe diffs. You read before editing. You verify before claiming completion. You preserve existing architecture unless the architecture itself is the failure. You surface uncertainty instead of hiding it.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This gives the model an operating posture.&lt;/p&gt;

&lt;p&gt;In a real harness, this identity can live in a stable file such as:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. SOUL.md
2. AGENTS.md
3. system profile
4. team-owned instruction file
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;It should define:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Who the agent is
2. What it optimizes for
3. How it communicates
4. What it refuses to do
5. How it uses tools
6. What it remembers
7. What it ignores
8. When it asks for help
9. When it stops
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;## Core Truths

- Read before writing.
  Existing systems contain context that the prompt does not.

- Small diffs beat broad rewrites.
  Local fixes are safer unless the abstraction itself is broken.

- Verification is part of the task.
  Never claim success without evidence.

- Production systems punish cleverness.
  Prefer explicit, observable, boring solutions.

- Uncertainty must be surfaced.
  A confident guess is worse than a clearly labeled assumption.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A next-level agent needs judgment, not just capability.&lt;/p&gt;

&lt;p&gt;Identity is where that judgment starts.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Give the agent a stable operating profile before giving it powerful tools.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Make the Execution Contract Explicit&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Every agent should have an execution contract.&lt;/p&gt;

&lt;p&gt;The execution contract tells the agent how work moves from task to completion.&lt;/p&gt;

&lt;p&gt;For a coding agent, the contract might be:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Understand the request.
2. Inspect relevant files.
3. Identify the smallest safe change.
4. Apply the change.
5. Run targeted tests.
6. Run broader tests if risk is high.
7. Summarize the diff.
8. Document verification.
9. List residual risk.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Without this contract, the agent improvises.&lt;/p&gt;

&lt;p&gt;Improvisation is fine for chat.&lt;/p&gt;

&lt;p&gt;It is dangerous for production systems.&lt;/p&gt;

&lt;p&gt;A better coding-agent instruction looks like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;You are debugging a production Python service.

Mission:
Find the smallest safe fix.

Workflow:

1. Read the exact error.
2. Inspect the file where the error originates.
3. Inspect the caller.
4. Search for similar patterns in the repository.
5. Identify the smallest local fix.
6. Apply the patch.
7. Run the narrowest relevant test.
8. If the touched surface is broad, run the related suite.
9. Report changed files, verification, and remaining risks.

Rules:

- Do not edit before reading.
- Do not introduce dependencies unless existing tools are insufficient.
- Do not rewrite modules for local bugs.
- Do not claim tests passed unless the command actually ran.
- Do not suppress uncertainty.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This is what separates an agent from a chatbot.&lt;/p&gt;

&lt;p&gt;The chatbot answers.&lt;/p&gt;

&lt;p&gt;The agent follows an execution contract.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Define how the agent starts, acts, verifies, and stops.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Treat Tools as Privileged Interfaces&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Most agent demos expose tools too casually.&lt;/p&gt;

&lt;p&gt;They give the model a shell, browser, database, file editor, or API client and trust the prompt to keep behavior sane.&lt;/p&gt;

&lt;p&gt;That is not enough.&lt;/p&gt;

&lt;p&gt;Tool use needs policy.&lt;/p&gt;

&lt;p&gt;For every tool, define:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. When to use it
2. When not to use it
3. Required preconditions
4. Allowed scope
5. Failure behavior
6. Retry limits
7. Logging requirements
8. Approval boundaries
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;## Shell Tool Policy

Use shell for:
- running tests
- inspecting repo structure
- searching files
- checking git state

Do not use shell for:
- destructive commands
- credential access
- broad file deletion
- installing dependencies without approval

Before mutation:
- inspect target files
- check git status
- prefer minimal commands

After mutation:
- run relevant verification
- summarize command output
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Every tool expands the agent’s action space.&lt;/p&gt;

&lt;p&gt;A larger action space means more capability, but also more failure modes.&lt;/p&gt;

&lt;p&gt;The harness should make tool use scoped, observable, and reversible where possible.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Tools should be powerful, scoped, observable, and policy-constrained.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Engineer Context Like a Runtime Resource&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Context is not a giant text box.&lt;/p&gt;

&lt;p&gt;Context is working memory.&lt;/p&gt;

&lt;p&gt;If you treat the context window like a dumping ground, agent quality degrades.&lt;/p&gt;

&lt;p&gt;The agent becomes distracted. Stale information competes with fresh information. The model starts to miss details that should have been obvious.&lt;/p&gt;

&lt;p&gt;A better mental model is a memory hierarchy:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;L0: stable identity
L1: task instructions
L2: active working context
L3: retrieved project context
L4: long-term memory
L5: external documents and tools
L6: durable progress artifacts
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Each layer has a job.&lt;/p&gt;

&lt;p&gt;The identity layer should be small and stable.&lt;/p&gt;

&lt;p&gt;The task layer should be specific.&lt;/p&gt;

&lt;p&gt;Retrieved context should be relevant and fresh.&lt;/p&gt;

&lt;p&gt;Memory should contain durable facts, not noise.&lt;/p&gt;

&lt;p&gt;Tool outputs should be summarized instead of blindly appended forever.&lt;/p&gt;

&lt;p&gt;Progress artifacts should preserve state across sessions.&lt;/p&gt;

&lt;p&gt;Context engineering asks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;What must be in the prompt?
What can be retrieved on demand?
What should be summarized?
What should be persisted?
What should be forgotten?
What should never enter context?
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;More context is not always better.&lt;/p&gt;

&lt;p&gt;Better-routed context is better.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Treat context like RAM, not storage.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Build Durable State Outside the Context Window&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Long-running agents fail when all state lives in the chat.&lt;/p&gt;

&lt;p&gt;Eventually, the context compresses, degrades, or disappears.&lt;/p&gt;

&lt;p&gt;The agent forgets why it made a decision, repeats work, loses track of tests, or declares success without remembering what is still broken.&lt;/p&gt;

&lt;p&gt;A serious harness needs durable progress artifacts.&lt;/p&gt;

&lt;p&gt;Examples:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;PROGRESS.md
PLAN.md
DECISIONS.md
RISKS.md
TODO.md
CHANGELOG.md
git commits
trace logs
test reports
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A weak long-running agent does this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Make many changes
2. Lose context
3. Forget why
4. Declare success
5. Leave broken state
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A strong long-running agent does this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Read progress
2. Select one task
3. Make a small change
4. Run verification
5. Commit or checkpoint
6. Update progress
7. Record risks
8. Continue
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;For coding agents, a good PROGRESS.md might look like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;## Current Goal

Implement scoped retry handling for failed ingestion jobs.

## Completed

- Identified retry path in worker.py
- Added unit test for transient network failure
- Confirmed existing backoff utility exists

## In Progress

- Wiring retry policy into ingestion worker

## Blockers

- Need to confirm max retry count for production

## Next Step

- Add integration test for failed job replay

## Risks

- Duplicate processing if idempotency key is missing
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This gives the next agent session a clean handoff.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Long-running agents need state outside the model.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Separate Memory From Skills&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Many agent systems confuse memory and skills.&lt;/p&gt;

&lt;p&gt;They are not the same. Memory stores facts. Skills store procedures.&lt;/p&gt;

&lt;p&gt;Memory answers:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;What does the agent know?&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;Skills answer:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;How does the agent do something?&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;Examples of memory:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;The project uses Poetry.
The user prefers concise technical explanations.
The staging deploy requires manual approval.
The API gateway owns refresh-token handling.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Examples of skills:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;How to debug a failing Kubernetes pod.
How to review a pull request.
How to investigate a latency regression.
How to create a database migration safely.
How to summarize a production incident.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A skill should be structured and reusable:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;---
name: latency-regression-debug
description: Use when p95/p99 latency increases after a deploy.
version: 1.0.0
---

## When to Use

Use when latency regression is reported after a code, config, infra, or model change.

## Procedure

1. Identify affected endpoint or job.
2. Compare p50, p95, and p99 before and after deploy.
3. Check recent diffs.
4. Inspect dependency latency.
5. Check queue depth and saturation.
6. Reproduce with a controlled benchmark if possible.
7. Propose the smallest reversible fix.

## Pitfalls

- Optimizing average latency while ignoring p99.
- Blaming the database before checking queueing.
- Ignoring cold starts.
- Comparing different traffic windows.

## Verification

- Same traffic class.
- Same time window.
- p95/p99 restored.
- No regression in error rate.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Skills act as procedural memory: they store how to do the work, while the memory layer stores facts. They help the agent avoid rediscovering workflows.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Facts go into memory. Repeatable procedures become skills.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Build the Evaluation Harness With the Agent Harness&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Agent evals are harder than normal LLM evals.&lt;/p&gt;

&lt;p&gt;A chatbot produces an answer. An agent produces a trajectory.&lt;/p&gt;

&lt;p&gt;That trajectory includes:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Tool calls
2. File reads
3. Edits
4. API calls
5. Retries
6. Failures
7. Recoveries
8. Test runs
9. Final output
10. State changes
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A final answer can look correct while the trajectory is bad.&lt;/p&gt;

&lt;p&gt;For example, the test passes, but the agent:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Edited the wrong abstraction
2. Ignored an existing helper
3. Introduced duplicate logic
4. Skipped security-sensitive checks
5. Used 40 unnecessary tool calls
6. Failed to document risk
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That should not be a full pass.&lt;/p&gt;

&lt;p&gt;A serious eval harness should measure both outcome quality and process quality.&lt;/p&gt;

&lt;p&gt;For agent systems, useful eval dimensions include:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Task success
2. Tool selection
3. Tool efficiency
4. State changes
5. Policy violations
6. Latency
7. Token cost
8. Retry behavior
9. Verification quality
10. Diff quality
11. Failure recovery
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The key idea is simple:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Agent harness = runs the agent
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Eval harness = runs the agent against tasks,
captures traces, grades outcomes,
and aggregates results
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;You need both.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Evaluate the trajectory, not just the answer.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Use Macro Evals to Debug Systemic Failures&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Single-run debugging is not enough.&lt;/p&gt;

&lt;p&gt;Agent systems fail in patterns.&lt;/p&gt;

&lt;p&gt;Examples:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Planner delegates too late
Researcher over-collects sources
Coder edits before reading
Reviewer focuses on style instead of correctness
Memory retrieval injects stale context
Tool retry loop burns tokens
Subagents duplicate work
Escalation happens too late
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Macro evals look across many traces to identify repeated failure modes.&lt;/p&gt;

&lt;p&gt;The workflow looks like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Collect traces
2. Score individual runs
3. Compress traces into comparable summaries
4. Cluster recurring behavior patterns
5. Rank patterns by impact
6. Inspect representative examples
7. Patch system behavior
8. Rerun evals
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This moves you from anecdotal debugging to distribution-level engineering.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Instead of asking:&lt;/p&gt;

&lt;p&gt;Why did this one run fail?&lt;/p&gt;

&lt;p&gt;Ask:&lt;/p&gt;

&lt;p&gt;What class of runs fails, and what system behavior causes it?&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;That is the difference between debugging an example and improving a platform.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Beginners debug examples. Advanced teams debug failure distributions.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Measure Reliability, Not Just Capability&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Agents are nondeterministic.&lt;/p&gt;

&lt;p&gt;One successful run does not mean the system is reliable.&lt;/p&gt;

&lt;p&gt;Two useful metrics are:&lt;/p&gt;

&lt;p&gt;pass@k = did at least one of k attempts succeed?&lt;/p&gt;

&lt;p&gt;pass^k = did all k attempts succeed?&lt;/p&gt;

&lt;p&gt;These measure different things.&lt;/p&gt;

&lt;p&gt;pass@k measures capability.&lt;/p&gt;

&lt;p&gt;It asks whether the system can solve the task if given multiple chances.&lt;/p&gt;

&lt;p&gt;pass^k measures consistency.&lt;/p&gt;

&lt;p&gt;It asks whether the system succeeds every time.&lt;/p&gt;

&lt;p&gt;A coding agent that solves a task once out of five attempts is capable.&lt;/p&gt;

&lt;p&gt;It is not reliable.&lt;/p&gt;

&lt;p&gt;A support agent that gives the correct policy once but fails randomly later is dangerous.&lt;/p&gt;

&lt;p&gt;Track:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Success rate
2. Variance
3. Retry count
4. Cost per success
5. Latency per success
6. Tool calls per success
7. Failure categories
8. Recovery rate
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;blockquote&gt;
&lt;p&gt;Practical rule: Production agents need consistency, not occasional brilliance.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Design Failure Handling Explicitly&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Most agent demos ignore failure handling.&lt;/p&gt;

&lt;p&gt;Real systems cannot.&lt;/p&gt;

&lt;p&gt;Every agent needs a failure model.&lt;/p&gt;

&lt;p&gt;Define what happens when:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. A tool call fails
2. Retrieval returns stale context
3. Tests fail
4. An API rate limit is hit
5. The agent loops
6. Required context is missing
7. Permissions are insufficient
8. Output confidence is low
9. Subagents disagree
10. Verification cannot be completed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A good failure policy looks like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;## Failure Policy

If a tool fails:
- retry once if the failure is transient
- do not retry destructive actions automatically
- summarize the failure
- choose an alternate path if available

If tests fail:
- inspect the failure
- make at most one targeted fix
- rerun the narrow test
- if still failing, stop and report

If context is insufficient:
- state what is missing
- proceed only with clearly labeled assumptions
- avoid irreversible actions
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Agents should not silently push through uncertainty.&lt;/p&gt;

&lt;p&gt;A reliable agent knows when to continue, when to retry, and when to stop.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Failure handling is part of the harness, not an afterthought.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Use Multi-Agent Systems Only When Coordination Pays Off&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Multi-agent systems sound advanced.&lt;/p&gt;

&lt;p&gt;Often they are just expensive chaos.&lt;/p&gt;

&lt;p&gt;Use multiple agents only when the task benefits from parallelism or specialization.&lt;/p&gt;

&lt;p&gt;Good fits:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Broad research
2. Multi-source investigation
3. Red-team / blue-team review
4. Planner-coder-reviewer workflows
5. Independent verification
6. Large codebase exploration
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Bad fits:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Simple Q&amp;amp;A
2. Small code edits
3. Basic summarization
4. Single-file changes
5. Narrow classification
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;A useful architecture:

 Lead agent
  owns task framing, planning, and synthesis

 Research agents
  explore independent branches

 Coder agent
  makes implementation changes

 Reviewer agent
  checks correctness, safety, and regressions

 Verifier agent
  runs tests and validates outputs
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Important harness rules:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Give each agent a narrow role
2. Set token and tool budgets
3. Require compressed findings
4. Avoid raw context dumps
5. Prevent duplicate work
6. Define handoff contracts
7. Evaluate the system as a whole
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Multi-agent systems are not automatically better.&lt;/p&gt;

&lt;p&gt;They are better only when coordination is cheaper than sequential work.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: Add agents when specialization creates leverage, not because the diagram looks impressive.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Add Observability From Day One&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;If you cannot inspect an agent, you cannot improve it.&lt;/p&gt;

&lt;p&gt;A production-grade harness should emit traces.&lt;/p&gt;

&lt;p&gt;Capture:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Input task
2. Loaded context
3. Retrieved memories
4. Selected skills
5. Tool calls
6. Tool outputs
7. State transitions
8. Errors
9. Retries
10. Final answer
11. Cost
12. Latency
13. User feedback
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Without traces, you cannot answer:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Why did the agent choose this tool?
Why did it ignore the relevant file?
Why did it retrieve stale memory?
Why did it loop?
Why did cost spike?
Why did the final answer look correct but fail?
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Observability enables debugging, evals, macro analysis, cost control, policy enforcement, skill improvement, and memory cleanup.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: No traces, no serious agent engineering.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Put Governance Around the Harness&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Agent adoption is not only technical.&lt;/p&gt;

&lt;p&gt;It is organizational.&lt;/p&gt;

&lt;p&gt;Without governance:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Every developer writes their own prompts
2. Permissions drift
3. Skills duplicate
4. Memory gets messy
5. Evals are missing
6. Tools are unsafe
7. Nobody owns regressions
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;With governance:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Shared configs
2. Shared skills
3. Shared evals
4. Clear permissions
5. Standard review process
6. Centralized observability
7. Safer rollout
8. Faster onboarding
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Every serious agent platform needs a DRI (directly responsible individual).&lt;/p&gt;

&lt;p&gt;Someone must own:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Identity files
2. Tool policies
3. Memory policy
4. Skill library
5. Eval suite
6. Permission model
7. Release process
8. Incident review
9. Documentation
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Bottom-up experimentation creates energy.&lt;/p&gt;

&lt;p&gt;Governance turns it into infrastructure.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Practical rule: If nobody owns the harness, nobody owns the agent.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fosuwh4tfvlf3ugxwh6ls.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fosuwh4tfvlf3ugxwh6ls.png" alt="Agentic Harness in Production" width="800" height="457"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Final Takeaway&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Next-level AI agents are not built by writing bigger prompts.&lt;/p&gt;

&lt;p&gt;They are built by engineering better harnesses.&lt;/p&gt;

&lt;p&gt;The model is the reasoning engine.&lt;/p&gt;

&lt;p&gt;The harness is the operating system around it.&lt;/p&gt;

&lt;p&gt;A serious agentic harness needs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Identity
2. Execution contracts
3. Tool policies
4. Context engineering
5. Memory discipline
6. Skills
7. Durable state
8. Failure handling
9. Trajectory evals
10. Macro evals
11. Observability
12. Governance
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;If you are a student, learn this early.&lt;/p&gt;

&lt;p&gt;If you are a developer, practice this deliberately.&lt;/p&gt;

&lt;p&gt;If you are building AI products, treat this as infrastructure.&lt;/p&gt;

&lt;p&gt;The best AI developers will not be the ones who only know how to call an API.&lt;/p&gt;

&lt;p&gt;They will be the ones who know how to design the system around the model.&lt;/p&gt;

&lt;p&gt;That is how we move from:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;“This AI agent helps me sometimes.”&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;To:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;“This agentic harness is part of my engineering system.”&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;References&lt;/strong&gt;
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;Anthropic: Demystifying evals for AI agents&lt;/li&gt;
&lt;li&gt;Anthropic: Building effective agents&lt;/li&gt;
&lt;li&gt;OpenAI Cookbook: Building Governed AI Agents&lt;/li&gt;
&lt;li&gt;Anthropic: Effective harnesses for long-running agents&lt;/li&gt;
&lt;li&gt;Anthropic: Effective context engineering for AI agents&lt;/li&gt;
&lt;li&gt;OpenAI Cookbook: Macro Evals for Agentic Systems&lt;/li&gt;
&lt;li&gt;OpenAI Cookbook: Getting started with OpenAI Evals&lt;/li&gt;
&lt;li&gt;Anthropic: How we built our multi-agent research system&lt;/li&gt;
&lt;/ol&gt;

</description>
      <category>ai</category>
      <category>agents</category>
      <category>llm</category>
      <category>agenticharness</category>
    </item>
    <item>
      <title>Personalized Food Recommendation RAG bot on WhatsApp</title>
      <dc:creator>Mrunmayee Rane</dc:creator>
      <pubDate>Sun, 26 Apr 2026 14:01:07 +0000</pubDate>
      <link>https://forem.com/mrunmayee_rane_9d0e22b4de/food-recommendation-app-3llp</link>
      <guid>https://forem.com/mrunmayee_rane_9d0e22b4de/food-recommendation-app-3llp</guid>
      <description>&lt;p&gt;Moving from New York City to the west coast, I found it difficult to decide as to what to eat for my meals. Also it was very challenging to find healthy and good restaurants in California. In New York city, it was easy to pick a spot and cuisine, cause every lane there were already 20–25 good restaurants. Having covered a wide range of restaurants in New York from the best of Chintan Pandya’s Dhamaka to the casual Thai at Up Thai, everything was at a quick walking or few subway stops away, In California, I was in for a surprise.&lt;/p&gt;

&lt;p&gt;Let’s face it: finding meal options that match personal preferences such as vegan, gluten-free, sugar-free, or pescatarian food across a variety of cuisines is like searching for a needle in a haystack. Having recognized this dilemma, and inspired by NVIDIA’s LLM Developer Day, I embarked on a mission to simplify this search.&lt;/p&gt;

&lt;p&gt;Goal? To create a system that understands your craving and points you to the ideal meal.&lt;/p&gt;

&lt;p&gt;The journey began with the &lt;a href="https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset" rel="noopener noreferrer"&gt;Yelp academic dataset&lt;/a&gt;, a huge goldmine of user reviews and business information. We zeroed in on California, a hub of diverse and vibrant culinary culture, and narrowed the data down to 20k samples for efficiency.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Leveraging Retrieval-Augmented Generation (RAG) for Personalized Recommendations&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;A key innovation in our system is the incorporation of Retrieval-Augmented Generation (RAG). RAG combines the strengths of both retrieval-based and generative AI models, enabling our system to provide highly accurate and personalized food recommendations. This approach works by first retrieving relevant information from our extensive dataset — in this case, the Yelp academic dataset — which includes a wide range of user reviews and business information. Then, using generative models, RAG synthesizes this information to produce coherent and context-specific recommendations. This method is particularly effective for catering to diverse dietary preferences and cuisines, as it can seamlessly integrate vast amounts of detailed data, including vegan, gluten-free, sugar-free, and pescatarian options. By leveraging RAG, we ensure that our recommendations are not just data-driven but also finely tuned to each user’s unique taste and preferences, truly embodying the essence of a personalized recommendation system.&lt;/p&gt;

&lt;p&gt;We merged the business and user review datasets, creating a detailed hashmap of businesses.&lt;/p&gt;

&lt;p&gt;This hashmap contained detailed information for each business, including the name, ID, address, city, state, postal code, user reviews, operational hours, and categories — a treasure of information for any foodie. Recognizing the complexity of handling multiple user reviews and ratings for a single business ID, we employed an aggregation method. This approach averaged user ratings and consolidated multiple reviews per business, ensuring a more streamlined dataset. Subsequently, we transformed the hashmap back into a dataframe, and eventually into a CSV file, to facilitate easier referencing and mapping.&lt;/p&gt;

&lt;p&gt;For the creation of embeddings and loading of the entire CSV document, we used langchain.document_loaders.csv_loader. To effectively manage the large volume of data, we divided the document into smaller chunks, enabling efficient processing by the LLM model. The RecursiveCharacterTextSplitter from LangChain was utilized for generic text splitting, ensuring the data was appropriately segmented.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A7N20Dk4RNdzygCP7" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A7N20Dk4RNdzygCP7" width="1024" height="294"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Text Embeddings:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Model path sets the pre-trained model to be used for embeddings which is &lt;em&gt;sentence-transformers/all-MiniLM-l6-v2.&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AvXIV-Gpv1i0YGZJu" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AvXIV-Gpv1i0YGZJu" width="1024" height="205"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It configures and initializes a sentence transformer model from Hugging Face for generating embeddings. It specifically uses the all-MiniLM-l6-v2 model, runs on the CPU, and produces non-normalized embeddings. Normalization is often used to standardize the length of the embedding vectors.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2ALzjxLnVQ_Vet3mWt" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2ALzjxLnVQ_Vet3mWt" width="1024" height="102"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Chroma is a tool used for efficient similarity search and retrieval in large collections of data. It helps when there’s a need to find the most similar items quickly, while having a large number of embeddings. from_documents is a method that creates a Chroma database from a set of documents and their embeddings. embeddings is an object initialized using HuggingFaceEmbeddings. These embeddings are capable of converting text documents into vector embeddings. The embeddings for the docs are generated and used by Chroma to enable efficient similarity searches.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Retrieved Data:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AdT22-ZZ3aSwP8wlQ" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AdT22-ZZ3aSwP8wlQ" width="1024" height="380"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Retriever creates a retriever object from the Chroma database (db), previously initialized.&lt;/p&gt;

&lt;p&gt;as_retriever is a method that transforms the database into a retriever capable of performing search operations. search_type="mmr" specifies the type of search algorithm used. “mmr” stands for Maximal Marginal Relevance. MMR is used to retrieve diverse results by balancing relevance and diversity, ensuring that the retrieved documents are not just relevant but also varied. get_relevant_documents is a method that takes a query and returns a list of documents that are most relevant to the query. num_results=7 specifies the number of results to return. It’s set to retrieve the top 7 relevant documents.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A54XBZGM5_2c3SICF" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A54XBZGM5_2c3SICF" width="1024" height="115"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The above statements save the embeddings locally in a persistent directory so that they can be easily retrieved when needed.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A0vgKZHNZrs6r7-zb" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A0vgKZHNZrs6r7-zb" width="1024" height="423"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Then it loads the Chroma database for similarity searches and performs a search with a specified query. similarity_search_with_score is a method that searches for the documents most similar to the given query based on their embeddings, along with a similarity score; the results are then sorted in descending order for highest ranking.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Prompt Creation:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A7kNvcvZD1Kg7q7I1" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2A7kNvcvZD1Kg7q7I1" width="1024" height="594"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I combined the details of the top 5 retrieved businesses with a detailed prompt using the prompt template in LangChain. This complete prompt is then sent to the Llama2–70B or SteerLM Llama 70B model using &lt;a href="https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/llama2-70b/api" rel="noopener noreferrer"&gt;NVIDIA’s Cloud Function (NVCF) API&lt;/a&gt; to generate a response.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Integration with WhatsApp through Twilio and Ngrok:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Twilio is a powerful platform for communications, enabling us to send and receive messages, make and receive phone calls, and more. In this project, we use Twilio to receive user queries via SMS and respond with food recommendations.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Setting Up Twilio Account&lt;/strong&gt; :&lt;/p&gt;

&lt;p&gt;First, you need to create a Twilio account and get a phone number that can send and receive SMS messages. Then obtain your Twilio Account SID, Auth Token, and phone number from the Twilio Console.&lt;/p&gt;

&lt;p&gt;Install the Twilio Python helper library to handle messaging: &lt;em&gt;pip install twilio&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Configure Twilio to Forward Incoming Messages&lt;/strong&gt; :&lt;/p&gt;

&lt;p&gt;In the Twilio Console, configure your Twilio phone number to forward incoming messages to your FastAPI endpoint exposed by ngrok. This is typically done by setting the “Messaging” webhook URL to point to your /recommendation endpoint (e.g., &lt;a href="https://your-domain.com/recommendation" rel="noopener noreferrer"&gt;https://your-domain.com/recommendation&lt;/a&gt;).&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Processing Incoming Messages&lt;/strong&gt; :&lt;/p&gt;

&lt;p&gt;In the FastAPI app, we define an endpoint /recommendation that will handle incoming POST requests from Twilio. Twilio sends incoming messages to this endpoint. When a message is received, the content of the message is extracted and passed to the generate_answer function, which generates the food recommendation based on the user’s query. The response is then wrapped in a Twilio MessagingResponse object and sent back to the user.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AQeSdVww-sbSlp9Tu" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AQeSdVww-sbSlp9Tu" width="1024" height="452"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Setting Up Webhooks in Twilio:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;To complete the integration, you need to set up a webhook in Twilio to point to your FastAPI endpoint exposed by ngrok. Here’s how you can do it:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Log in to your Twilio Console.&lt;/li&gt;
&lt;li&gt;Navigate to the “Phone Numbers” section and select the number you want to use.&lt;/li&gt;
&lt;li&gt;In the “Messaging” section, set the “A Message Comes In” webhook to your ngrok URL, e.g., &lt;a href="https://your-domain.com/recommendation" rel="noopener noreferrer"&gt;https://your-domain.com/recommendation&lt;/a&gt;.
&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;Why Use Ngrok?&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Ngrok is a tool that creates a secure tunnel to your localhost, allowing you to expose a local server to the internet. When developing locally, your FastAPI application runs on localhost, which is not accessible from the internet. Twilio needs a publicly accessible URL to send webhook requests to your /recommendation endpoint. Ngrok provides this by tunneling requests from a public URL to your local development server.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Setting Up Ngrok:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Install Ngrok by using the &lt;em&gt;pip install pyngrok&lt;/em&gt; command.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Sign Up and Configure Ngrok&lt;/strong&gt; :&lt;/p&gt;

&lt;p&gt;Sign up for a free account on the Ngrok website to get your authentication token. After signing up, you will receive an authentication token which you need to configure Ngrok. Use the following command to add your auth token. “&lt;em&gt;ngrok authtoken YOUR_AUTH_TOKEN&lt;/em&gt;”&lt;/p&gt;

&lt;p&gt;First, run your FastAPI application on your local machine, then open a new terminal and start Ngrok with “&lt;em&gt;ngrok http 5000&lt;/em&gt;”. After starting Ngrok, you will see the forwarding link, which we will use to configure the Twilio webhook.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Update Twilio Webhook&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Set the “A Message Comes In” webhook in the “Messaging” section to your ngrok public URL followed by the /recommendation endpoint.&lt;/p&gt;

&lt;p&gt;The technologies behind these tastes were LangChain, Hugging Face, pandas, Chroma for vector storage, Streamlit for the user interface, and the SteerLM Llama 70B model through NVIDIA’s Cloud Function (NVCF).&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fl68g4qw8wnr5ydoe55g0.gif" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fl68g4qw8wnr5ydoe55g0.gif" width="8" height="17"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What’s Next: Enhancing and Expanding:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;My vision includes integrating user feedback mechanisms, map functionalities, and personalized dietary preferences into the system. We also plan to evaluate our method with a larger dataset not limited to California, refining our approach for even better accuracy.&lt;/p&gt;

&lt;p&gt;Happy to connect on &lt;a href="https://www.linkedin.com/in/mrunmayeerane/" rel="noopener noreferrer"&gt;LinkedIn&lt;/a&gt;!&lt;/p&gt;

</description>
      <category>rags</category>
      <category>nvidia</category>
      <category>llm</category>
      <category>llmapplications</category>
    </item>
    <item>
      <title>Building a Multi Agent Career and Workplace Assistant at Stanford Hackathon</title>
      <dc:creator>Mrunmayee Rane</dc:creator>
      <pubDate>Tue, 28 Jan 2025 06:35:46 +0000</pubDate>
      <link>https://forem.com/mrunmayee_rane_9d0e22b4de/building-a-multi-agent-career-and-workplace-assistant-at-stanford-hackathon-2gj4</link>
      <guid>https://forem.com/mrunmayee_rane_9d0e22b4de/building-a-multi-agent-career-and-workplace-assistant-at-stanford-hackathon-2gj4</guid>
      <description>&lt;p&gt;I participated in my first Hackathon for Women in AI at Stanford University, organized by Twelve Labs, Zilliz, GenAI Collective, and Women Who Do Data (W2D2). One of the key insights I gained is that integrity is the most underrated value in today’s workplaces, often leading to conflicts and misunderstandings when overlooked.&lt;/p&gt;

&lt;p&gt;In today’s fast-paced professional landscape, employees face numerous challenges, ranging from workplace stress to navigating career transitions. Despite the availability of HR teams, training programs, and mentorship opportunities, there remains a significant gap in providing real-time, personalized, and scalable guidance.&lt;/p&gt;

&lt;p&gt;This introduces the &lt;strong&gt;Multi Agent Career and Workplace Assistant&lt;/strong&gt;, an innovative application designed to address this gap. By leveraging AI-powered tools, it delivers personalized career coaching and workplace guidance, empowering employees to overcome challenges and achieve their professional goals.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fmvbpfzn1tn12advh7sv7.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fmvbpfzn1tn12advh7sv7.png" width="800" height="482"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  The Problem
&lt;/h3&gt;

&lt;p&gt;Employees often encounter challenges such as:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Workplace Stress:&lt;/strong&gt; Interpersonal conflicts, unclear communication, or work overload.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Career Transition:&lt;/strong&gt; Navigating skill gaps, identifying the right resources, and making informed career decisions.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Organizations face challenges too:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Scalability:&lt;/strong&gt; Providing mentorship and career coaching to all employees.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Customization:&lt;/strong&gt; Tailoring advice to individual needs.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Engagement:&lt;/strong&gt; Delivering relevant, on-demand learning resources.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  The Solution: Multi-Agent Career and Workplace Assistant
&lt;/h3&gt;

&lt;p&gt;This application uses a multi-agent framework to classify and address employee queries into two categories:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Workplace Stress:&lt;/strong&gt; Provides actionable advice and stress management resources.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Career Transition:&lt;/strong&gt; Generates a structured learning path for transitioning into new fields.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;  &lt;iframe src="https://www.youtube.com/embed/0PYpx34D8FE"&gt;
  &lt;/iframe&gt;
&lt;/p&gt;

&lt;h3&gt;
  
  
  Key Features
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Intent Classification:&lt;/strong&gt; Uses AI to determine whether the query is related to workplace stress or career transition.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Resource Retrieval:&lt;/strong&gt; Embeds and retrieves relevant videos and PDFs using advanced embedding models such as Twelve Labs’ Marengo Retriever 2.7 and Hugging Face sentence-transformers/all-MiniLM-L6-v2.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Generative AI Integration:&lt;/strong&gt; Delivers personalized advice using the Gemini 2.0 Flash Experimental model.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Scalability:&lt;/strong&gt; Supports multiple users with minimal human intervention.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Technology Stack
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;LangChain&lt;/strong&gt; : For embedding and managing document embeddings.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Twelve Labs&lt;/strong&gt; : For video embeddings and processing.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Zilliz/Milvus&lt;/strong&gt; : To store and retrieve vector embeddings efficiently.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Streamlit&lt;/strong&gt; : For a user-friendly interface.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Google Generative AI&lt;/strong&gt; : To generate natural language responses using the Gemini 2.0 Flash Experimental model.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;PyPDF2&lt;/strong&gt; : For PDF parsing and text extraction.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;How does it Work?&lt;/strong&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Step 1: Intent Classification
&lt;/h3&gt;

&lt;p&gt;The assistant classifies user queries into two categories:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Career Transition:&lt;/strong&gt; Queries about learning new skills or exploring new career paths.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Workplace Stress:&lt;/strong&gt; Queries about workplace conflicts, communication issues, or stress management.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AlSKMikGZslhLNeTb" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AlSKMikGZslhLNeTb" width="1024" height="585"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Step 2: Resource Embedding and Retrieval
&lt;/h3&gt;

&lt;p&gt;The application preprocesses and embeds PDFs and videos into Zilliz/Milvus, enabling fast and efficient similarity searches.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Embedding PDFs:&lt;/strong&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2APWz_Ls4kgRmJRnd5" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2APWz_Ls4kgRmJRnd5" width="1024" height="756"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Embedding Videos:&lt;/strong&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2Asxt_L_dT0YWLvRIe" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2Asxt_L_dT0YWLvRIe" width="1024" height="893"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Similarity Search:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AtLpZmme3_Rv1-w3K" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AtLpZmme3_Rv1-w3K" width="1024" height="489"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Step 3: Creating System Prompt
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;Based on the classified intent, the assistant queries the embedding database for relevant resources and uses Generative AI to provide personalized recommendations.&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AwrwU4nx4EMnZNk1Q" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AwrwU4nx4EMnZNk1Q" width="1024" height="778"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Step 4: Streamlit UI
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;The application uses Streamlit for an intuitive UI where users input queries and receive personalized advice. The retrieved PDFs and videos are displayed with thumbnails and clickable links.&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AiXLjtV6JQqem8DbT" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F1024%2F0%2AiXLjtV6JQqem8DbT" width="1024" height="748"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Conclusion
&lt;/h3&gt;

&lt;p&gt;The Multi-Agent Career and Workplace Assistant bridges the gap between employees and scalable, personalized mentorship. By leveraging state-of-the-art AI tools, it provides timely and actionable guidance, ensuring both employees and organizations thrive in today’s dynamic professional environments.&lt;/p&gt;

&lt;p&gt;GitHub: &lt;a href="https://github.com/mrunmayee17/Career_and_Workplace_Assistant" rel="noopener noreferrer"&gt;https://github.com/mrunmayee17/Career_and_Workplace_Assistant&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Happy to connect on &lt;a href="https://www.linkedin.com/in/mrunmayeerane/" rel="noopener noreferrer"&gt;LinkedIn&lt;/a&gt;!&lt;/p&gt;

</description>
      <category>hackathon</category>
      <category>milvus</category>
      <category>womeninai</category>
      <category>gemini</category>
    </item>
  </channel>
</rss>
