feat(tools): add WebFetch and WebSearch parity primitives

Implement the first web-oriented Claude Code parity slice in the Rust tools crate. This adds concrete WebFetch and WebSearch tool specs, execution paths, lightweight HTML/search-result extraction, domain filtering, and local HTTP-backed tests while leaving the existing core file and shell tools intact.

Constraint: Keep the change scoped to tools-only Rust workspace code
Constraint: Match Claude Code tool names and JSON schemas closely enough for parity work
Rejected: Stub-only tool registrations | would not materially expand beyond MVP
Rejected: Full browser/search service integration | too large for this first logical slice
Confidence: medium
Scope-risk: moderate
Reversibility: clean
Directive: Treat these web helpers as a parity foundation; refine result quality without renaming the exposed tool contracts
Tested: cargo fmt; cargo test -p tools
Not-tested: cargo clippy; full workspace cargo test
This commit is contained in:
Yeachan-Heo
2026-03-31 19:15:05 +00:00
parent 4586764a0e
commit 5b106b840d
3 changed files with 637 additions and 1 deletion

View File

@@ -1,8 +1,12 @@
use std::collections::BTreeSet;
use std::time::{Duration, Instant};
use reqwest::blocking::Client;
use runtime::{
edit_file, execute_bash, glob_search, grep_search, read_file, write_file, BashCommandInput,
GrepSearchInput,
};
use serde::Deserialize;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -140,6 +144,40 @@ pub fn mvp_tool_specs() -> Vec<ToolSpec> {
"additionalProperties": false
}),
},
ToolSpec {
name: "WebFetch",
description:
"Fetch a URL, convert it into readable text, and answer a prompt about it.",
input_schema: json!({
"type": "object",
"properties": {
"url": { "type": "string", "format": "uri" },
"prompt": { "type": "string" }
},
"required": ["url", "prompt"],
"additionalProperties": false
}),
},
ToolSpec {
name: "WebSearch",
description: "Search the web for current information and return cited results.",
input_schema: json!({
"type": "object",
"properties": {
"query": { "type": "string", "minLength": 2 },
"allowed_domains": {
"type": "array",
"items": { "type": "string" }
},
"blocked_domains": {
"type": "array",
"items": { "type": "string" }
}
},
"required": ["query"],
"additionalProperties": false
}),
},
]
}
@@ -151,6 +189,8 @@ pub fn execute_tool(name: &str, input: &Value) -> Result<String, String> {
"edit_file" => from_value::<EditFileInput>(input).and_then(run_edit_file),
"glob_search" => from_value::<GlobSearchInputValue>(input).and_then(run_glob_search),
"grep_search" => from_value::<GrepSearchInput>(input).and_then(run_grep_search),
"WebFetch" => from_value::<WebFetchInput>(input).and_then(run_web_fetch),
"WebSearch" => from_value::<WebSearchInput>(input).and_then(run_web_search),
_ => Err(format!("unsupported tool: {name}")),
}
}
@@ -192,6 +232,14 @@ fn run_grep_search(input: GrepSearchInput) -> Result<String, String> {
to_pretty_json(grep_search(&input).map_err(io_to_string)?)
}
fn run_web_fetch(input: WebFetchInput) -> Result<String, String> {
to_pretty_json(execute_web_fetch(&input)?)
}
fn run_web_search(input: WebSearchInput) -> Result<String, String> {
to_pretty_json(execute_web_search(&input)?)
}
/// Serialize any `Serialize` value to pretty-printed JSON, converting the
/// serde error into the crate's string-based error type.
fn to_pretty_json<T: serde::Serialize>(value: T) -> Result<String, String> {
    match serde_json::to_string_pretty(&value) {
        Ok(rendered) => Ok(rendered),
        Err(error) => Err(error.to_string()),
    }
}
@@ -227,8 +275,411 @@ struct GlobSearchInputValue {
path: Option<String>,
}
/// Arguments accepted by the `WebFetch` tool (mirrors its JSON schema).
#[derive(Debug, Deserialize)]
struct WebFetchInput {
    // Target URL; plain-http is upgraded to https for non-local hosts.
    url: String,
    // Instruction used to shape the returned summary (e.g. "title", "summarize").
    prompt: String,
}
/// Arguments accepted by the `WebSearch` tool (mirrors its JSON schema).
#[derive(Debug, Deserialize)]
struct WebSearchInput {
    // Search query text.
    query: String,
    // When present, only hits on these domains (or subdomains) are kept.
    allowed_domains: Option<Vec<String>>,
    // When present, hits on these domains (or subdomains) are removed.
    blocked_domains: Option<Vec<String>>,
}
/// JSON payload returned by the `WebFetch` tool; camelCase renames keep the
/// wire format aligned with the Claude Code parity target.
#[derive(Debug, Serialize)]
struct WebFetchOutput {
    // Byte length of the decoded response body text.
    bytes: usize,
    // HTTP status code of the final (post-redirect) response.
    code: u16,
    // Canonical HTTP reason phrase, e.g. "OK".
    #[serde(rename = "codeText")]
    code_text: String,
    // Prompt-aware textual summary of the fetched page.
    result: String,
    // Wall-clock duration of the fetch in milliseconds.
    #[serde(rename = "durationMs")]
    duration_ms: u128,
    // Final URL after any redirects.
    url: String,
}
/// JSON payload returned by the `WebSearch` tool.
#[derive(Debug, Serialize)]
struct WebSearchOutput {
    // Echo of the original query.
    query: String,
    // Commentary string plus structured hit block (see WebSearchResultItem).
    results: Vec<WebSearchResultItem>,
    // Wall-clock duration of the search in fractional seconds.
    #[serde(rename = "durationSeconds")]
    duration_seconds: f64,
}
/// One entry in the WebSearch `results` array.
///
/// `untagged` means a `Commentary` serializes as a bare JSON string while a
/// `SearchResult` serializes as an object with `tool_use_id` and `content`.
#[derive(Debug, Serialize)]
#[serde(untagged)]
enum WebSearchResultItem {
    // Structured hit list attributed to a tool-use id.
    SearchResult {
        tool_use_id: String,
        content: Vec<SearchHit>,
    },
    // Free-form markdown summary for the model to read.
    Commentary(String),
}
/// A single scraped search result: link text plus resolved absolute URL.
#[derive(Debug, Serialize)]
struct SearchHit {
    title: String,
    url: String,
}
/// Fetch `input.url`, normalize the response body to readable text, and
/// build a prompt-aware summary plus basic transfer metadata.
///
/// All failures (bad URL, transport errors, body decode errors) are
/// surfaced as `Err(String)` for the tool layer.
fn execute_web_fetch(input: &WebFetchInput) -> Result<WebFetchOutput, String> {
    let started = Instant::now();
    let client = build_http_client()?;
    // May upgrade plain-http URLs to https for non-local hosts.
    let request_url = normalize_fetch_url(&input.url)?;
    let response = client
        .get(request_url.clone())
        .send()
        .map_err(|error| error.to_string())?;
    let status = response.status();
    // Capture the post-redirect URL so output reflects where we landed.
    let final_url = response.url().to_string();
    let code = status.as_u16();
    let code_text = status.canonical_reason().unwrap_or("Unknown").to_string();
    let content_type = response
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|value| value.to_str().ok())
        .unwrap_or_default()
        .to_string();
    let body = response.text().map_err(|error| error.to_string())?;
    // `bytes` counts the decoded text, not the raw transfer size.
    let bytes = body.len();
    // HTML bodies are stripped to text; everything else is trimmed as-is.
    let normalized = normalize_fetched_content(&body, &content_type);
    let result = summarize_web_fetch(&final_url, &input.prompt, &normalized);
    Ok(WebFetchOutput {
        bytes,
        code,
        code_text,
        result,
        duration_ms: started.elapsed().as_millis(),
        url: final_url,
    })
}
/// Run a web search: query the configured endpoint, scrape result links out
/// of the returned HTML, apply domain allow/block filters, dedupe, and
/// package the hits together with a markdown commentary block.
fn execute_web_search(input: &WebSearchInput) -> Result<WebSearchOutput, String> {
    let started = Instant::now();
    let client = build_http_client()?;
    let search_url = build_search_url(&input.query)?;
    let response = client
        .get(search_url)
        .send()
        .map_err(|error| error.to_string())?;
    let final_url = response.url().clone();
    let html = response.text().map_err(|error| error.to_string())?;
    // Primary extraction targets DuckDuckGo's `result__a` anchors; fall back
    // to scraping any absolute links when that yields nothing.
    let mut hits = extract_search_hits(&html);
    if hits.is_empty() && final_url.host_str().is_some() {
        hits = extract_search_hits_from_generic_links(&html);
    }
    // Allow-list runs first, then block-list; both match subdomains too.
    if let Some(allowed) = input.allowed_domains.as_ref() {
        hits.retain(|hit| host_matches_list(&hit.url, allowed));
    }
    if let Some(blocked) = input.blocked_domains.as_ref() {
        hits.retain(|hit| !host_matches_list(&hit.url, blocked));
    }
    dedupe_hits(&mut hits);
    // Cap the payload at the top 8 unique hits.
    hits.truncate(8);
    let summary = if hits.is_empty() {
        format!("No web search results matched the query {:?}.", input.query)
    } else {
        // Markdown bullet list of hits, one "- [title](url)" per line.
        let rendered_hits = hits
            .iter()
            .map(|hit| format!("- [{}]({})", hit.title, hit.url))
            .collect::<Vec<_>>()
            .join("\n");
        format!(
            "Search results for {:?}. Include a Sources section in the final answer.\n{}",
            input.query, rendered_hits
        )
    };
    Ok(WebSearchOutput {
        query: input.query.clone(),
        results: vec![
            // Commentary first, then the structured hit list.
            WebSearchResultItem::Commentary(summary),
            WebSearchResultItem::SearchResult {
                tool_use_id: String::from("web_search_1"),
                content: hits,
            },
        ],
        duration_seconds: started.elapsed().as_secs_f64(),
    })
}
/// Construct the blocking HTTP client shared by both web tools:
/// 20-second timeout, at most 10 redirects, and a stable user agent.
fn build_http_client() -> Result<Client, String> {
    let builder = Client::builder()
        .user_agent("clawd-rust-tools/0.1")
        .redirect(reqwest::redirect::Policy::limited(10))
        .timeout(Duration::from_secs(20));
    builder.build().map_err(|error| error.to_string())
}
/// Validate `url` and upgrade plain-http URLs to https, except for local
/// development hosts (localhost and loopback), which stay on http.
///
/// Returns the normalized URL string, or a parse/upgrade error message.
fn normalize_fetch_url(url: &str) -> Result<String, String> {
    let parsed = reqwest::Url::parse(url).map_err(|error| error.to_string())?;
    if parsed.scheme() == "http" {
        let host = parsed.host_str().unwrap_or_default();
        // The url crate serializes IPv6 hosts WITH brackets, so the loopback
        // check must cover "[::1]"; the bare "::1" form is kept defensively.
        let is_local = matches!(host, "localhost" | "127.0.0.1" | "[::1]" | "::1");
        if !is_local {
            let mut upgraded = parsed;
            upgraded
                .set_scheme("https")
                .map_err(|_| String::from("failed to upgrade URL to https"))?;
            return Ok(upgraded.to_string());
        }
    }
    Ok(parsed.to_string())
}
/// Resolve the search endpoint — the `CLAWD_WEB_SEARCH_BASE_URL` override
/// when set, otherwise DuckDuckGo's HTML frontend — and attach the query
/// as the `q` parameter.
fn build_search_url(query: &str) -> Result<reqwest::Url, String> {
    let base = std::env::var("CLAWD_WEB_SEARCH_BASE_URL")
        .unwrap_or_else(|_| String::from("https://html.duckduckgo.com/html/"));
    let mut url = reqwest::Url::parse(&base).map_err(|error| error.to_string())?;
    url.query_pairs_mut().append_pair("q", query);
    Ok(url)
}
/// Reduce a response body to plain text: HTML bodies are stripped of tags,
/// anything else is returned whitespace-trimmed.
fn normalize_fetched_content(body: &str, content_type: &str) -> String {
    match content_type.contains("html") {
        true => html_to_text(body),
        false => body.trim().to_string(),
    }
}
/// Build the human-readable `result` string for WebFetch, shaping the
/// detail section around keywords found in the caller's prompt.
fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String {
    let compact = collapse_whitespace(content);
    let prompt_lower = prompt.to_lowercase();
    let wants_title = prompt_lower.contains("title");
    let wants_summary =
        prompt_lower.contains("summary") || prompt_lower.contains("summarize");
    let detail = if wants_title {
        // Fall back to a plain preview when no title-like line exists.
        match extract_title(content) {
            Some(title) => format!("Title: {title}"),
            None => preview_text(&compact, 600),
        }
    } else if wants_summary {
        preview_text(&compact, 900)
    } else {
        // Generic prompts get the prompt echoed plus a content preview.
        format!(
            "Prompt: {prompt}\nContent preview:\n{}",
            preview_text(&compact, 900)
        )
    };
    format!("Fetched {url}\n{detail}")
}
/// Return the first non-blank line of `content`, trimmed, or `None` when
/// every line is empty or whitespace.
fn extract_title(content: &str) -> Option<String> {
    content
        .lines()
        .map(str::trim)
        .find(|line| !line.is_empty())
        .map(|line| line.to_string())
}
/// Strip tags from `html`, squeeze whitespace runs into single spaces, and
/// decode the small set of common HTML entities.
///
/// This is a deliberately tiny scanner, not a real HTML parser: everything
/// between '<' and '>' is dropped, everything else is kept as text.
fn html_to_text(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut last_was_space = false;
    for ch in html.chars() {
        if ch == '<' {
            in_tag = true;
        } else if ch == '>' {
            // Closes a tag; a stray '>' outside a tag is also dropped.
            in_tag = false;
        } else if in_tag {
            // Skip tag names and attributes entirely.
        } else if ch.is_whitespace() {
            // Emit at most one space per whitespace run.
            if !last_was_space {
                text.push(' ');
                last_was_space = true;
            }
        } else {
            text.push(ch);
            last_was_space = false;
        }
    }
    collapse_whitespace(&decode_html_entities(&text))
}
/// Decode the handful of HTML entities this crate's scraper cares about.
///
/// `&amp;` MUST be decoded last: decoding it first turns double-escaped
/// sequences like `&amp;lt;` into `&lt;` and then incorrectly into `<`,
/// corrupting text (and URLs) that legitimately contain escaped entities.
fn decode_html_entities(input: &str) -> String {
    input
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}
/// Collapse every whitespace run in `input` to a single space and drop
/// leading/trailing whitespace.
fn collapse_whitespace(input: &str) -> String {
    let mut compact = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !compact.is_empty() {
            compact.push(' ');
        }
        compact.push_str(word);
    }
    compact
}
/// Truncate `input` to at most `max_chars` characters (characters, not
/// bytes, so multi-byte UTF-8 is never split).
///
/// Single pass over `char_indices` instead of counting chars and then
/// re-iterating with `take`; the truncated form is right-trimmed exactly as
/// before, and the redundant `format!("{}", ..)` wrapper is gone.
fn preview_text(input: &str, max_chars: usize) -> String {
    match input.char_indices().nth(max_chars) {
        // Fewer than or exactly `max_chars` characters: return unchanged.
        None => input.to_string(),
        // Cut at the byte offset of the first character past the limit.
        Some((cut, _)) => input[..cut].trim_end().to_string(),
    }
}
/// Scrape search hits from DuckDuckGo-style HTML by scanning for anchors
/// carrying the `result__a` class, without a real HTML parser.
///
/// Malformed anchors are skipped by advancing the scan window and retrying;
/// hits whose href cannot be resolved to an absolute URL are dropped.
fn extract_search_hits(html: &str) -> Vec<SearchHit> {
    let mut hits = Vec::new();
    let mut remaining = html;
    while let Some(anchor_start) = remaining.find("result__a") {
        let after_class = &remaining[anchor_start..];
        // Each bail-out advances at least one byte so the loop terminates.
        let Some(href_idx) = after_class.find("href=") else {
            remaining = &after_class[1..];
            continue;
        };
        // Skip past `href=` (5 bytes) to the quoted value.
        let href_slice = &after_class[href_idx + 5..];
        let Some((url, rest)) = extract_quoted_value(href_slice) else {
            remaining = &after_class[1..];
            continue;
        };
        let Some(close_tag_idx) = rest.find('>') else {
            remaining = &after_class[1..];
            continue;
        };
        let after_tag = &rest[close_tag_idx + 1..];
        let Some(end_anchor_idx) = after_tag.find("</a>") else {
            remaining = &after_tag[1..];
            continue;
        };
        // The anchor's inner HTML, stripped to text, becomes the hit title.
        let title = html_to_text(&after_tag[..end_anchor_idx]);
        // Resolve DuckDuckGo redirect wrappers (/l/?uddg=...) to the target;
        // hrefs that cannot be made absolute are silently discarded.
        if let Some(decoded_url) = decode_duckduckgo_redirect(&url) {
            hits.push(SearchHit {
                title: title.trim().to_string(),
                url: decoded_url,
            });
        }
        // Continue scanning after the closing </a> (4 bytes).
        remaining = &after_tag[end_anchor_idx + 4..];
    }
    hits
}
/// Fallback scraper used when the `result__a` scan finds nothing: walk every
/// `<a` anchor in the document and keep links with non-empty text and an
/// absolute http(s) URL.
fn extract_search_hits_from_generic_links(html: &str) -> Vec<SearchHit> {
    let mut hits = Vec::new();
    let mut remaining = html;
    while let Some(anchor_start) = remaining.find("<a") {
        let after_anchor = &remaining[anchor_start..];
        // Each bail-out advances past the matched "<a" so the loop terminates.
        let Some(href_idx) = after_anchor.find("href=") else {
            remaining = &after_anchor[2..];
            continue;
        };
        // Skip past `href=` (5 bytes) to the quoted value.
        let href_slice = &after_anchor[href_idx + 5..];
        let Some((url, rest)) = extract_quoted_value(href_slice) else {
            remaining = &after_anchor[2..];
            continue;
        };
        let Some(close_tag_idx) = rest.find('>') else {
            remaining = &after_anchor[2..];
            continue;
        };
        let after_tag = &rest[close_tag_idx + 1..];
        let Some(end_anchor_idx) = after_tag.find("</a>") else {
            remaining = &after_anchor[2..];
            continue;
        };
        let title = html_to_text(&after_tag[..end_anchor_idx]);
        // Anchors with no visible text (icons, wrappers) are not useful hits.
        if title.trim().is_empty() {
            remaining = &after_tag[end_anchor_idx + 4..];
            continue;
        }
        // Unlike the primary scraper, an unresolvable redirect keeps the raw
        // href, which is then filtered by the absolute-URL check below.
        let decoded_url = decode_duckduckgo_redirect(&url).unwrap_or(url);
        if decoded_url.starts_with("http://") || decoded_url.starts_with("https://") {
            hits.push(SearchHit {
                title: title.trim().to_string(),
                url: decoded_url,
            });
        }
        remaining = &after_tag[end_anchor_idx + 4..];
    }
    hits
}
/// Parse a leading quoted attribute value.
///
/// `input` must begin with `"` or `'`; returns the unquoted value plus the
/// remainder after the closing quote, or `None` when either quote is absent.
fn extract_quoted_value(input: &str) -> Option<(String, &str)> {
    let quote = match input.chars().next() {
        Some(c @ ('"' | '\'')) => c,
        _ => return None,
    };
    // Both accepted quote characters are one byte wide in UTF-8.
    let body = &input[1..];
    let close = body.find(quote)?;
    Some((body[..close].to_string(), &body[close + 1..]))
}
/// Turn a scraped href into an absolute URL, unwrapping DuckDuckGo's
/// `/l/?uddg=<target>` redirect links when present.
///
/// Returns `None` for hrefs that are neither absolute, scheme-relative
/// (`//host/...`), nor root-relative (`/path`).
fn decode_duckduckgo_redirect(url: &str) -> Option<String> {
    if url.starts_with("http://") || url.starts_with("https://") {
        // Already absolute; just undo HTML entity escaping (e.g. &amp;).
        return Some(html_entity_decode_url(url));
    }
    let joined = if url.starts_with("//") {
        format!("https:{url}")
    } else if url.starts_with('/') {
        // Root-relative links are assumed to live on duckduckgo.com.
        format!("https://duckduckgo.com{url}")
    } else {
        return None;
    };
    let parsed = reqwest::Url::parse(&joined).ok()?;
    if parsed.path() == "/l/" || parsed.path() == "/l" {
        // The real destination is carried in the `uddg` query parameter;
        // query_pairs() percent-decodes it for us.
        for (key, value) in parsed.query_pairs() {
            if key == "uddg" {
                return Some(html_entity_decode_url(value.as_ref()));
            }
        }
    }
    // Redirect path without a uddg parameter: keep the joined URL as-is.
    Some(joined)
}
/// Entity-decode a URL string. Thin alias over the generic decoder, kept so
/// call sites read as URL-specific and a URL-aware implementation can be
/// swapped in later without touching callers.
fn html_entity_decode_url(url: &str) -> String {
    decode_html_entities(url)
}
/// Report whether `url`'s host matches any entry in `domains`, where an
/// entry matches its exact host and every subdomain of it.
///
/// Matching is case-insensitive (DNS names are), entries are trimmed, and a
/// leading '.' is ignored so ".example.com" behaves like "example.com".
/// Unparseable URLs and host-less URLs never match.
fn host_matches_list(url: &str, domains: &[String]) -> bool {
    let Ok(parsed) = reqwest::Url::parse(url) else {
        return false;
    };
    let Some(host) = parsed.host_str() else {
        return false;
    };
    let host = host.to_ascii_lowercase();
    domains.iter().any(|domain| {
        let normalized = domain.trim().trim_start_matches('.').to_ascii_lowercase();
        // Subdomain check without allocating ".{domain}" per comparison:
        // the host must end with the domain AND the preceding byte be '.'.
        host == normalized
            || (host.len() > normalized.len()
                && host.ends_with(&normalized)
                && host.as_bytes()[host.len() - normalized.len() - 1] == b'.')
    })
}
/// Remove hits whose URL was already seen, keeping the first occurrence and
/// preserving the original ordering.
fn dedupe_hits(hits: &mut Vec<SearchHit>) {
    let mut seen_urls: BTreeSet<String> = BTreeSet::new();
    hits.retain(|hit| {
        let first_time = !seen_urls.contains(&hit.url);
        if first_time {
            seen_urls.insert(hit.url.clone());
        }
        first_time
    });
}
#[cfg(test)]
mod tests {
use std::io::{Read, Write};
use std::net::{SocketAddr, TcpListener};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use super::{execute_tool, mvp_tool_specs};
use serde_json::json;
@@ -240,6 +691,8 @@ mod tests {
.collect::<Vec<_>>();
assert!(names.contains(&"bash"));
assert!(names.contains(&"read_file"));
assert!(names.contains(&"WebFetch"));
assert!(names.contains(&"WebSearch"));
}
#[test]
@@ -247,4 +700,167 @@ mod tests {
let error = execute_tool("nope", &json!({})).expect_err("tool should be rejected");
assert!(error.contains("unsupported tool"));
}
#[test]
fn web_fetch_returns_prompt_aware_summary() {
let server = TestServer::spawn(Arc::new(|request_line: &str| {
assert!(request_line.starts_with("GET /page "));
HttpResponse::html(
200,
"OK",
"<html><head><title>Ignored</title></head><body><h1>Test Page</h1><p>Hello <b>world</b> from local server.</p></body></html>",
)
}));
let result = execute_tool(
"WebFetch",
&json!({
"url": format!("http://{}/page", server.addr()),
"prompt": "Summarize this page"
}),
)
.expect("WebFetch should succeed");
let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
assert_eq!(output["code"], 200);
let summary = output["result"].as_str().expect("result string");
assert!(summary.contains("Fetched"));
assert!(summary.contains("Test Page"));
assert!(summary.contains("Hello world from local server"));
}
#[test]
fn web_search_extracts_and_filters_results() {
let server = TestServer::spawn(Arc::new(|request_line: &str| {
assert!(request_line.contains("GET /search?q=rust+web+search "));
HttpResponse::html(
200,
"OK",
r#"
<html><body>
<a class="result__a" href="https://docs.rs/reqwest">Reqwest docs</a>
<a class="result__a" href="https://example.com/blocked">Blocked result</a>
</body></html>
"#,
)
}));
std::env::set_var(
"CLAWD_WEB_SEARCH_BASE_URL",
format!("http://{}/search", server.addr()),
);
let result = execute_tool(
"WebSearch",
&json!({
"query": "rust web search",
"allowed_domains": ["docs.rs"],
"blocked_domains": ["example.com"]
}),
)
.expect("WebSearch should succeed");
std::env::remove_var("CLAWD_WEB_SEARCH_BASE_URL");
let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
assert_eq!(output["query"], "rust web search");
let results = output["results"].as_array().expect("results array");
let search_result = results
.iter()
.find(|item| item.get("content").is_some())
.expect("search result block present");
let content = search_result["content"].as_array().expect("content array");
assert_eq!(content.len(), 1);
assert_eq!(content[0]["title"], "Reqwest docs");
assert_eq!(content[0]["url"], "https://docs.rs/reqwest");
}
    /// Minimal single-threaded HTTP server for exercising the web tools
    /// against localhost without real network access.
    struct TestServer {
        // Bound address (127.0.0.1 with an OS-assigned port).
        addr: SocketAddr,
        // Taken in `Drop` to tell the accept loop to exit.
        shutdown: Option<std::sync::mpsc::Sender<()>>,
        // Accept-loop thread handle; joined on drop.
        handle: Option<thread::JoinHandle<()>>,
    }
    impl TestServer {
        /// Bind 127.0.0.1 on an ephemeral port and serve requests on a
        /// background thread until the shutdown channel fires.
        ///
        /// `handler` receives only the HTTP request line (e.g. "GET /p HTTP/1.1")
        /// and returns the response to write back.
        fn spawn(handler: Arc<dyn Fn(&str) -> HttpResponse + Send + Sync + 'static>) -> Self {
            let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
            // Non-blocking accept lets the loop poll the shutdown channel
            // between connection attempts instead of blocking forever.
            listener
                .set_nonblocking(true)
                .expect("set nonblocking listener");
            let addr = listener.local_addr().expect("local addr");
            let (tx, rx) = std::sync::mpsc::channel::<()>();
            let handle = thread::spawn(move || loop {
                if rx.try_recv().is_ok() {
                    break;
                }
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        // NOTE(review): assumes the full request arrives in a
                        // single read of up to 4096 bytes — fine for the small
                        // local requests these tests issue.
                        let mut buffer = [0_u8; 4096];
                        let size = stream.read(&mut buffer).expect("read request");
                        let request = String::from_utf8_lossy(&buffer[..size]).into_owned();
                        // Only the request line is handed to the handler.
                        let request_line = request.lines().next().unwrap_or_default().to_string();
                        let response = handler(&request_line);
                        stream
                            .write_all(response.to_bytes().as_slice())
                            .expect("write response");
                    }
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        // No pending connection; back off briefly before polling again.
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("server accept failed: {error}"),
                }
            });
            Self {
                addr,
                shutdown: Some(tx),
                handle: Some(handle),
            }
        }
        /// Address clients should dial, e.g. `http://{addr}/path`.
        fn addr(&self) -> SocketAddr {
            self.addr
        }
    }
impl Drop for TestServer {
fn drop(&mut self) {
if let Some(tx) = self.shutdown.take() {
let _ = tx.send(());
}
if let Some(handle) = self.handle.take() {
handle.join().expect("join test server");
}
}
}
struct HttpResponse {
status: u16,
reason: &'static str,
content_type: &'static str,
body: String,
}
impl HttpResponse {
fn html(status: u16, reason: &'static str, body: &str) -> Self {
Self {
status,
reason,
content_type: "text/html; charset=utf-8",
body: body.to_string(),
}
}
fn to_bytes(&self) -> Vec<u8> {
format!(
"HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
self.status,
self.reason,
self.content_type,
self.body.len(),
self.body
)
.into_bytes()
}
}
}