From 67423d005ae84004a321c5f7977551aaa0b664f7 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 31 Mar 2026 20:26:06 +0000 Subject: [PATCH] Improve WebFetch title prompts for HTML pages Make title-focused WebFetch prompts prefer the real HTML value when present instead of always falling back to the first rendered text line. Keep the behavior narrow and preserve the existing summary path for non-title prompts.\n\nConstraint: Must not touch unrelated dirty api files in this worktree\nConstraint: Keep the change limited to rust/crates/tools\nRejected: Broader HTML parsing dependency | not needed for this small parity slice\nConfidence: high\nScope-risk: narrow\nReversibility: clean\nDirective: Preserve lightweight HTML handling unless parity requires a materially more robust parser\nTested: cargo test -p tools\nNot-tested: malformed HTML with mixed-case or nested title edge cases --- rust/crates/tools/src/lib.rs | 40 ++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/rust/crates/tools/src/lib.rs b/rust/crates/tools/src/lib.rs index 930c0d7..89c2dc5 100644 --- a/rust/crates/tools/src/lib.rs +++ b/rust/crates/tools/src/lib.rs @@ -639,7 +639,7 @@ fn execute_web_fetch(input: &WebFetchInput) -> Result<WebFetchOutput, String> { let body = response.text().map_err(|error| error.to_string())?; let bytes = body.len(); let normalized = normalize_fetched_content(&body, &content_type); - let result = summarize_web_fetch(&final_url, &input.prompt, &normalized); + let result = summarize_web_fetch(&final_url, &input.prompt, &normalized, &body, &content_type); Ok(WebFetchOutput { bytes, @@ -750,12 +750,18 @@ fn normalize_fetched_content(body: &str, content_type: &str) -> String { } } -fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String { +fn summarize_web_fetch( + url: &str, + prompt: &str, + content: &str, + raw_body: &str, + content_type: &str, +) -> String { let lower_prompt = prompt.to_lowercase(); let compact = collapse_whitespace(content); let detail = if lower_prompt.contains("title") { - extract_title(content) + extract_title(content, raw_body, content_type) .map(|title| format!("Title: {title}")) .unwrap_or_else(|| preview_text(&compact, 600)) } else if lower_prompt.contains("summary") || lower_prompt.contains("summarize") { @@ -768,7 +774,21 @@ fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String { format!("Fetched {url}\n{detail}") } -fn extract_title(content: &str) -> Option<String> { +fn extract_title(content: &str, raw_body: &str, content_type: &str) -> Option<String> { + if content_type.contains("html") { + let lowered = raw_body.to_lowercase(); + if let Some(start) = lowered.find("<title>") { + let after = start + "<title>".len(); + if let Some(end_rel) = lowered[after..].find("") { + let title = + collapse_whitespace(&decode_html_entities(&raw_body[after..after + end_rel])); + if !title.is_empty() { + return Some(title); + } + } + } + } + for line in content.lines() { let trimmed = line.trim(); if !trimmed.is_empty() { @@ -1798,6 +1818,18 @@ mod tests { assert!(summary.contains("Fetched")); assert!(summary.contains("Test Page")); assert!(summary.contains("Hello world from local server")); + + let titled = execute_tool( + "WebFetch", + &json!({ + "url": format!("http://{}/page", server.addr()), + "prompt": "What is the page title?" + }), + ) + .expect("WebFetch title query should succeed"); + let titled_output: serde_json::Value = serde_json::from_str(&titled).expect("valid json"); + let titled_summary = titled_output["result"].as_str().expect("result string"); + assert!(titled_summary.contains("Title: Ignored")); } #[test]