feat(tools): add WebFetch and WebSearch parity primitives

Implement the first web-oriented Claude Code parity slice in the Rust tools crate. This adds concrete WebFetch and WebSearch tool specs, execution paths, lightweight HTML/search-result extraction, domain filtering, and local HTTP-backed tests while leaving the existing core file and shell tools intact.

Constraint: Keep the change scoped to tools-only Rust workspace code
Constraint: Match Claude Code tool names and JSON schemas closely enough for parity work
Rejected: Stub-only tool registrations | would not materially expand beyond MVP
Rejected: Full browser/search service integration | too large for this first logical slice
Confidence: medium
Scope-risk: moderate
Reversibility: clean
Directive: Treat these web helpers as a parity foundation; refine result quality without renaming the exposed tool contracts
Tested: cargo fmt; cargo test -p tools
Not-tested: cargo clippy; full workspace cargo test
This commit is contained in:
Yeachan-Heo
2026-03-31 19:15:05 +00:00
parent 4586764a0e
commit 5b106b840d
3 changed files with 637 additions and 1 deletion

View File

@@ -1,8 +1,12 @@
use std::collections::BTreeSet;
use std::time::{Duration, Instant};
use reqwest::blocking::Client;
use runtime::{
edit_file, execute_bash, glob_search, grep_search, read_file, write_file, BashCommandInput,
GrepSearchInput,
};
use serde::Deserialize;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -140,6 +144,40 @@ pub fn mvp_tool_specs() -> Vec<ToolSpec> {
"additionalProperties": false
}),
},
ToolSpec {
name: "WebFetch",
description:
"Fetch a URL, convert it into readable text, and answer a prompt about it.",
input_schema: json!({
"type": "object",
"properties": {
"url": { "type": "string", "format": "uri" },
"prompt": { "type": "string" }
},
"required": ["url", "prompt"],
"additionalProperties": false
}),
},
ToolSpec {
name: "WebSearch",
description: "Search the web for current information and return cited results.",
input_schema: json!({
"type": "object",
"properties": {
"query": { "type": "string", "minLength": 2 },
"allowed_domains": {
"type": "array",
"items": { "type": "string" }
},
"blocked_domains": {
"type": "array",
"items": { "type": "string" }
}
},
"required": ["query"],
"additionalProperties": false
}),
},
]
}
@@ -151,6 +189,8 @@ pub fn execute_tool(name: &str, input: &Value) -> Result<String, String> {
"edit_file" => from_value::<EditFileInput>(input).and_then(run_edit_file),
"glob_search" => from_value::<GlobSearchInputValue>(input).and_then(run_glob_search),
"grep_search" => from_value::<GrepSearchInput>(input).and_then(run_grep_search),
"WebFetch" => from_value::<WebFetchInput>(input).and_then(run_web_fetch),
"WebSearch" => from_value::<WebSearchInput>(input).and_then(run_web_search),
_ => Err(format!("unsupported tool: {name}")),
}
}
@@ -192,6 +232,14 @@ fn run_grep_search(input: GrepSearchInput) -> Result<String, String> {
to_pretty_json(grep_search(&input).map_err(io_to_string)?)
}
fn run_web_fetch(input: WebFetchInput) -> Result<String, String> {
to_pretty_json(execute_web_fetch(&input)?)
}
fn run_web_search(input: WebSearchInput) -> Result<String, String> {
to_pretty_json(execute_web_search(&input)?)
}
/// Serialize any `Serialize` value to pretty-printed JSON, converting the
/// serde error into the crate's string-based error type.
fn to_pretty_json<T: serde::Serialize>(value: T) -> Result<String, String> {
    match serde_json::to_string_pretty(&value) {
        Ok(rendered) => Ok(rendered),
        Err(error) => Err(error.to_string()),
    }
}
@@ -227,8 +275,411 @@ struct GlobSearchInputValue {
path: Option<String>,
}
/// Arguments accepted by the `WebFetch` tool (mirrors its JSON schema).
#[derive(Debug, Deserialize)]
struct WebFetchInput {
    // Target URL; plain-http is upgraded to https for non-local hosts.
    url: String,
    // Instruction used to shape the returned summary (e.g. "title", "summarize").
    prompt: String,
}
/// Arguments accepted by the `WebSearch` tool (mirrors its JSON schema).
#[derive(Debug, Deserialize)]
struct WebSearchInput {
    // Search query text.
    query: String,
    // When present, only hits on these domains (or subdomains) are kept.
    allowed_domains: Option<Vec<String>>,
    // When present, hits on these domains (or subdomains) are removed.
    blocked_domains: Option<Vec<String>>,
}
/// JSON payload returned by the `WebFetch` tool; camelCase renames keep the
/// wire format aligned with the Claude Code parity target.
#[derive(Debug, Serialize)]
struct WebFetchOutput {
    // Byte length of the decoded response body text.
    bytes: usize,
    // HTTP status code of the final (post-redirect) response.
    code: u16,
    // Canonical HTTP reason phrase, e.g. "OK".
    #[serde(rename = "codeText")]
    code_text: String,
    // Prompt-aware textual summary of the fetched page.
    result: String,
    // Wall-clock duration of the fetch in milliseconds.
    #[serde(rename = "durationMs")]
    duration_ms: u128,
    // Final URL after any redirects.
    url: String,
}
/// JSON payload returned by the `WebSearch` tool.
#[derive(Debug, Serialize)]
struct WebSearchOutput {
    // Echo of the original query.
    query: String,
    // Commentary string plus structured hit block (see WebSearchResultItem).
    results: Vec<WebSearchResultItem>,
    // Wall-clock duration of the search in fractional seconds.
    #[serde(rename = "durationSeconds")]
    duration_seconds: f64,
}
/// One entry in the WebSearch `results` array.
///
/// `untagged` means a `Commentary` serializes as a bare JSON string while a
/// `SearchResult` serializes as an object with `tool_use_id` and `content`.
#[derive(Debug, Serialize)]
#[serde(untagged)]
enum WebSearchResultItem {
    // Structured hit list attributed to a tool-use id.
    SearchResult {
        tool_use_id: String,
        content: Vec<SearchHit>,
    },
    // Free-form markdown summary for the model to read.
    Commentary(String),
}
/// A single scraped search result: link text plus resolved absolute URL.
#[derive(Debug, Serialize)]
struct SearchHit {
    title: String,
    url: String,
}
/// Fetch `input.url`, normalize the response body to readable text, and
/// build a prompt-aware summary plus basic transfer metadata.
///
/// All failures (bad URL, transport errors, body decode errors) are
/// surfaced as `Err(String)` for the tool layer.
fn execute_web_fetch(input: &WebFetchInput) -> Result<WebFetchOutput, String> {
    let started = Instant::now();
    let client = build_http_client()?;
    // May upgrade plain-http URLs to https for non-local hosts.
    let request_url = normalize_fetch_url(&input.url)?;
    let response = client
        .get(request_url.clone())
        .send()
        .map_err(|error| error.to_string())?;
    let status = response.status();
    // Capture the post-redirect URL so output reflects where we landed.
    let final_url = response.url().to_string();
    let code = status.as_u16();
    let code_text = status.canonical_reason().unwrap_or("Unknown").to_string();
    let content_type = response
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|value| value.to_str().ok())
        .unwrap_or_default()
        .to_string();
    let body = response.text().map_err(|error| error.to_string())?;
    // `bytes` counts the decoded text, not the raw transfer size.
    let bytes = body.len();
    // HTML bodies are stripped to text; everything else is trimmed as-is.
    let normalized = normalize_fetched_content(&body, &content_type);
    let result = summarize_web_fetch(&final_url, &input.prompt, &normalized);
    Ok(WebFetchOutput {
        bytes,
        code,
        code_text,
        result,
        duration_ms: started.elapsed().as_millis(),
        url: final_url,
    })
}
/// Run a web search: query the configured endpoint, scrape result links out
/// of the returned HTML, apply domain allow/block filters, dedupe, and
/// package the hits together with a markdown commentary block.
fn execute_web_search(input: &WebSearchInput) -> Result<WebSearchOutput, String> {
    let started = Instant::now();
    let client = build_http_client()?;
    let search_url = build_search_url(&input.query)?;
    let response = client
        .get(search_url)
        .send()
        .map_err(|error| error.to_string())?;
    let final_url = response.url().clone();
    let html = response.text().map_err(|error| error.to_string())?;
    // Primary extraction targets DuckDuckGo's `result__a` anchors; fall back
    // to scraping any absolute links when that yields nothing.
    let mut hits = extract_search_hits(&html);
    if hits.is_empty() && final_url.host_str().is_some() {
        hits = extract_search_hits_from_generic_links(&html);
    }
    // Allow-list runs first, then block-list; both match subdomains too.
    if let Some(allowed) = input.allowed_domains.as_ref() {
        hits.retain(|hit| host_matches_list(&hit.url, allowed));
    }
    if let Some(blocked) = input.blocked_domains.as_ref() {
        hits.retain(|hit| !host_matches_list(&hit.url, blocked));
    }
    dedupe_hits(&mut hits);
    // Cap the payload at the top 8 unique hits.
    hits.truncate(8);
    let summary = if hits.is_empty() {
        format!("No web search results matched the query {:?}.", input.query)
    } else {
        // Markdown bullet list of hits, one "- [title](url)" per line.
        let rendered_hits = hits
            .iter()
            .map(|hit| format!("- [{}]({})", hit.title, hit.url))
            .collect::<Vec<_>>()
            .join("\n");
        format!(
            "Search results for {:?}. Include a Sources section in the final answer.\n{}",
            input.query, rendered_hits
        )
    };
    Ok(WebSearchOutput {
        query: input.query.clone(),
        results: vec![
            // Commentary first, then the structured hit list.
            WebSearchResultItem::Commentary(summary),
            WebSearchResultItem::SearchResult {
                tool_use_id: String::from("web_search_1"),
                content: hits,
            },
        ],
        duration_seconds: started.elapsed().as_secs_f64(),
    })
}
/// Construct the blocking HTTP client shared by both web tools:
/// 20-second timeout, at most 10 redirects, and a stable user agent.
fn build_http_client() -> Result<Client, String> {
    let builder = Client::builder()
        .user_agent("clawd-rust-tools/0.1")
        .redirect(reqwest::redirect::Policy::limited(10))
        .timeout(Duration::from_secs(20));
    builder.build().map_err(|error| error.to_string())
}
/// Validate `url` and upgrade plain-http URLs to https, except for local
/// development hosts (localhost and loopback), which stay on http.
///
/// Returns the normalized URL string, or a parse/upgrade error message.
fn normalize_fetch_url(url: &str) -> Result<String, String> {
    let parsed = reqwest::Url::parse(url).map_err(|error| error.to_string())?;
    if parsed.scheme() == "http" {
        let host = parsed.host_str().unwrap_or_default();
        // The url crate serializes IPv6 hosts WITH brackets, so the loopback
        // check must cover "[::1]"; the bare "::1" form is kept defensively.
        let is_local = matches!(host, "localhost" | "127.0.0.1" | "[::1]" | "::1");
        if !is_local {
            let mut upgraded = parsed;
            upgraded
                .set_scheme("https")
                .map_err(|_| String::from("failed to upgrade URL to https"))?;
            return Ok(upgraded.to_string());
        }
    }
    Ok(parsed.to_string())
}
/// Resolve the search endpoint — the `CLAWD_WEB_SEARCH_BASE_URL` override
/// when set, otherwise DuckDuckGo's HTML frontend — and attach the query
/// as the `q` parameter.
fn build_search_url(query: &str) -> Result<reqwest::Url, String> {
    let base = std::env::var("CLAWD_WEB_SEARCH_BASE_URL")
        .unwrap_or_else(|_| String::from("https://html.duckduckgo.com/html/"));
    let mut url = reqwest::Url::parse(&base).map_err(|error| error.to_string())?;
    url.query_pairs_mut().append_pair("q", query);
    Ok(url)
}
/// Reduce a response body to plain text: HTML bodies are stripped of tags,
/// anything else is returned whitespace-trimmed.
fn normalize_fetched_content(body: &str, content_type: &str) -> String {
    match content_type.contains("html") {
        true => html_to_text(body),
        false => body.trim().to_string(),
    }
}
/// Build the human-readable `result` string for WebFetch, shaping the
/// detail section around keywords found in the caller's prompt.
fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String {
    let compact = collapse_whitespace(content);
    let prompt_lower = prompt.to_lowercase();
    let wants_title = prompt_lower.contains("title");
    let wants_summary =
        prompt_lower.contains("summary") || prompt_lower.contains("summarize");
    let detail = if wants_title {
        // Fall back to a plain preview when no title-like line exists.
        match extract_title(content) {
            Some(title) => format!("Title: {title}"),
            None => preview_text(&compact, 600),
        }
    } else if wants_summary {
        preview_text(&compact, 900)
    } else {
        // Generic prompts get the prompt echoed plus a content preview.
        format!(
            "Prompt: {prompt}\nContent preview:\n{}",
            preview_text(&compact, 900)
        )
    };
    format!("Fetched {url}\n{detail}")
}
/// Return the first non-blank line of `content`, trimmed, or `None` when
/// every line is empty or whitespace.
fn extract_title(content: &str) -> Option<String> {
    content
        .lines()
        .map(str::trim)
        .find(|line| !line.is_empty())
        .map(|line| line.to_string())
}
/// Strip tags from `html`, squeeze whitespace runs into single spaces, and
/// decode the small set of common HTML entities.
///
/// This is a deliberately tiny scanner, not a real HTML parser: everything
/// between '<' and '>' is dropped, everything else is kept as text.
fn html_to_text(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut last_was_space = false;
    for ch in html.chars() {
        if ch == '<' {
            in_tag = true;
        } else if ch == '>' {
            // Closes a tag; a stray '>' outside a tag is also dropped.
            in_tag = false;
        } else if in_tag {
            // Skip tag names and attributes entirely.
        } else if ch.is_whitespace() {
            // Emit at most one space per whitespace run.
            if !last_was_space {
                text.push(' ');
                last_was_space = true;
            }
        } else {
            text.push(ch);
            last_was_space = false;
        }
    }
    collapse_whitespace(&decode_html_entities(&text))
}
/// Decode the handful of HTML entities this crate's scraper cares about.
///
/// `&amp;` MUST be decoded last: decoding it first turns double-escaped
/// sequences like `&amp;lt;` into `&lt;` and then incorrectly into `<`,
/// corrupting text (and URLs) that legitimately contain escaped entities.
fn decode_html_entities(input: &str) -> String {
    input
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}
/// Collapse every whitespace run in `input` to a single space and drop
/// leading/trailing whitespace.
fn collapse_whitespace(input: &str) -> String {
    let mut compact = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !compact.is_empty() {
            compact.push(' ');
        }
        compact.push_str(word);
    }
    compact
}
/// Truncate `input` to at most `max_chars` characters (characters, not
/// bytes, so multi-byte UTF-8 is never split).
///
/// Single pass over `char_indices` instead of counting chars and then
/// re-iterating with `take`; the truncated form is right-trimmed exactly as
/// before, and the redundant `format!("{}", ..)` wrapper is gone.
fn preview_text(input: &str, max_chars: usize) -> String {
    match input.char_indices().nth(max_chars) {
        // Fewer than or exactly `max_chars` characters: return unchanged.
        None => input.to_string(),
        // Cut at the byte offset of the first character past the limit.
        Some((cut, _)) => input[..cut].trim_end().to_string(),
    }
}
/// Scrape search hits from DuckDuckGo-style HTML by scanning for anchors
/// carrying the `result__a` class, without a real HTML parser.
///
/// Malformed anchors are skipped by advancing the scan window and retrying;
/// hits whose href cannot be resolved to an absolute URL are dropped.
fn extract_search_hits(html: &str) -> Vec<SearchHit> {
    let mut hits = Vec::new();
    let mut remaining = html;
    while let Some(anchor_start) = remaining.find("result__a") {
        let after_class = &remaining[anchor_start..];
        // Each bail-out advances at least one byte so the loop terminates.
        let Some(href_idx) = after_class.find("href=") else {
            remaining = &after_class[1..];
            continue;
        };
        // Skip past `href=` (5 bytes) to the quoted value.
        let href_slice = &after_class[href_idx + 5..];
        let Some((url, rest)) = extract_quoted_value(href_slice) else {
            remaining = &after_class[1..];
            continue;
        };
        let Some(close_tag_idx) = rest.find('>') else {
            remaining = &after_class[1..];
            continue;
        };
        let after_tag = &rest[close_tag_idx + 1..];
        let Some(end_anchor_idx) = after_tag.find("</a>") else {
            remaining = &after_tag[1..];
            continue;
        };
        // The anchor's inner HTML, stripped to text, becomes the hit title.
        let title = html_to_text(&after_tag[..end_anchor_idx]);
        // Resolve DuckDuckGo redirect wrappers (/l/?uddg=...) to the target;
        // hrefs that cannot be made absolute are silently discarded.
        if let Some(decoded_url) = decode_duckduckgo_redirect(&url) {
            hits.push(SearchHit {
                title: title.trim().to_string(),
                url: decoded_url,
            });
        }
        // Continue scanning after the closing </a> (4 bytes).
        remaining = &after_tag[end_anchor_idx + 4..];
    }
    hits
}
/// Fallback scraper used when the `result__a` scan finds nothing: walk every
/// `<a` anchor in the document and keep links with non-empty text and an
/// absolute http(s) URL.
fn extract_search_hits_from_generic_links(html: &str) -> Vec<SearchHit> {
    let mut hits = Vec::new();
    let mut remaining = html;
    while let Some(anchor_start) = remaining.find("<a") {
        let after_anchor = &remaining[anchor_start..];
        // Each bail-out advances past the matched "<a" so the loop terminates.
        let Some(href_idx) = after_anchor.find("href=") else {
            remaining = &after_anchor[2..];
            continue;
        };
        // Skip past `href=` (5 bytes) to the quoted value.
        let href_slice = &after_anchor[href_idx + 5..];
        let Some((url, rest)) = extract_quoted_value(href_slice) else {
            remaining = &after_anchor[2..];
            continue;
        };
        let Some(close_tag_idx) = rest.find('>') else {
            remaining = &after_anchor[2..];
            continue;
        };
        let after_tag = &rest[close_tag_idx + 1..];
        let Some(end_anchor_idx) = after_tag.find("</a>") else {
            remaining = &after_anchor[2..];
            continue;
        };
        let title = html_to_text(&after_tag[..end_anchor_idx]);
        // Anchors with no visible text (icons, wrappers) are not useful hits.
        if title.trim().is_empty() {
            remaining = &after_tag[end_anchor_idx + 4..];
            continue;
        }
        // Unlike the primary scraper, an unresolvable redirect keeps the raw
        // href, which is then filtered by the absolute-URL check below.
        let decoded_url = decode_duckduckgo_redirect(&url).unwrap_or(url);
        if decoded_url.starts_with("http://") || decoded_url.starts_with("https://") {
            hits.push(SearchHit {
                title: title.trim().to_string(),
                url: decoded_url,
            });
        }
        remaining = &after_tag[end_anchor_idx + 4..];
    }
    hits
}
/// Parse a leading quoted attribute value.
///
/// `input` must begin with `"` or `'`; returns the unquoted value plus the
/// remainder after the closing quote, or `None` when either quote is absent.
fn extract_quoted_value(input: &str) -> Option<(String, &str)> {
    let quote = match input.chars().next() {
        Some(c @ ('"' | '\'')) => c,
        _ => return None,
    };
    // Both accepted quote characters are one byte wide in UTF-8.
    let body = &input[1..];
    let close = body.find(quote)?;
    Some((body[..close].to_string(), &body[close + 1..]))
}
/// Turn a scraped href into an absolute URL, unwrapping DuckDuckGo's
/// `/l/?uddg=<target>` redirect links when present.
///
/// Returns `None` for hrefs that are neither absolute, scheme-relative
/// (`//host/...`), nor root-relative (`/path`).
fn decode_duckduckgo_redirect(url: &str) -> Option<String> {
    if url.starts_with("http://") || url.starts_with("https://") {
        // Already absolute; just undo HTML entity escaping (e.g. &amp;).
        return Some(html_entity_decode_url(url));
    }
    let joined = if url.starts_with("//") {
        format!("https:{url}")
    } else if url.starts_with('/') {
        // Root-relative links are assumed to live on duckduckgo.com.
        format!("https://duckduckgo.com{url}")
    } else {
        return None;
    };
    let parsed = reqwest::Url::parse(&joined).ok()?;
    if parsed.path() == "/l/" || parsed.path() == "/l" {
        // The real destination is carried in the `uddg` query parameter;
        // query_pairs() percent-decodes it for us.
        for (key, value) in parsed.query_pairs() {
            if key == "uddg" {
                return Some(html_entity_decode_url(value.as_ref()));
            }
        }
    }
    // Redirect path without a uddg parameter: keep the joined URL as-is.
    Some(joined)
}
/// Entity-decode a URL string. Thin alias over the generic decoder, kept so
/// call sites read as URL-specific and a URL-aware implementation can be
/// swapped in later without touching callers.
fn html_entity_decode_url(url: &str) -> String {
    decode_html_entities(url)
}
/// Report whether `url`'s host matches any entry in `domains`, where an
/// entry matches its exact host and every subdomain of it.
///
/// Matching is case-insensitive (DNS names are), entries are trimmed, and a
/// leading '.' is ignored so ".example.com" behaves like "example.com".
/// Unparseable URLs and host-less URLs never match.
fn host_matches_list(url: &str, domains: &[String]) -> bool {
    let Ok(parsed) = reqwest::Url::parse(url) else {
        return false;
    };
    let Some(host) = parsed.host_str() else {
        return false;
    };
    let host = host.to_ascii_lowercase();
    domains.iter().any(|domain| {
        let normalized = domain.trim().trim_start_matches('.').to_ascii_lowercase();
        // Subdomain check without allocating ".{domain}" per comparison:
        // the host must end with the domain AND the preceding byte be '.'.
        host == normalized
            || (host.len() > normalized.len()
                && host.ends_with(&normalized)
                && host.as_bytes()[host.len() - normalized.len() - 1] == b'.')
    })
}
/// Remove hits whose URL was already seen, keeping the first occurrence and
/// preserving the original ordering.
fn dedupe_hits(hits: &mut Vec<SearchHit>) {
    let mut seen_urls: BTreeSet<String> = BTreeSet::new();
    hits.retain(|hit| {
        let first_time = !seen_urls.contains(&hit.url);
        if first_time {
            seen_urls.insert(hit.url.clone());
        }
        first_time
    });
}
#[cfg(test)]
mod tests {
use std::io::{Read, Write};
use std::net::{SocketAddr, TcpListener};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use super::{execute_tool, mvp_tool_specs};
use serde_json::json;
@@ -240,6 +691,8 @@ mod tests {
.collect::<Vec<_>>();
assert!(names.contains(&"bash"));
assert!(names.contains(&"read_file"));
assert!(names.contains(&"WebFetch"));
assert!(names.contains(&"WebSearch"));
}
#[test]
@@ -247,4 +700,167 @@ mod tests {
let error = execute_tool("nope", &json!({})).expect_err("tool should be rejected");
assert!(error.contains("unsupported tool"));
}
#[test]
fn web_fetch_returns_prompt_aware_summary() {
let server = TestServer::spawn(Arc::new(|request_line: &str| {
assert!(request_line.starts_with("GET /page "));
HttpResponse::html(
200,
"OK",
"<html><head><title>Ignored</title></head><body><h1>Test Page</h1><p>Hello <b>world</b> from local server.</p></body></html>",
)
}));
let result = execute_tool(
"WebFetch",
&json!({
"url": format!("http://{}/page", server.addr()),
"prompt": "Summarize this page"
}),
)
.expect("WebFetch should succeed");
let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
assert_eq!(output["code"], 200);
let summary = output["result"].as_str().expect("result string");
assert!(summary.contains("Fetched"));
assert!(summary.contains("Test Page"));
assert!(summary.contains("Hello world from local server"));
}
#[test]
fn web_search_extracts_and_filters_results() {
let server = TestServer::spawn(Arc::new(|request_line: &str| {
assert!(request_line.contains("GET /search?q=rust+web+search "));
HttpResponse::html(
200,
"OK",
r#"
<html><body>
<a class="result__a" href="https://docs.rs/reqwest">Reqwest docs</a>
<a class="result__a" href="https://example.com/blocked">Blocked result</a>
</body></html>
"#,
)
}));
std::env::set_var(
"CLAWD_WEB_SEARCH_BASE_URL",
format!("http://{}/search", server.addr()),
);
let result = execute_tool(
"WebSearch",
&json!({
"query": "rust web search",
"allowed_domains": ["docs.rs"],
"blocked_domains": ["example.com"]
}),
)
.expect("WebSearch should succeed");
std::env::remove_var("CLAWD_WEB_SEARCH_BASE_URL");
let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
assert_eq!(output["query"], "rust web search");
let results = output["results"].as_array().expect("results array");
let search_result = results
.iter()
.find(|item| item.get("content").is_some())
.expect("search result block present");
let content = search_result["content"].as_array().expect("content array");
assert_eq!(content.len(), 1);
assert_eq!(content[0]["title"], "Reqwest docs");
assert_eq!(content[0]["url"], "https://docs.rs/reqwest");
}
    /// Minimal single-threaded HTTP server for exercising the web tools
    /// against localhost without real network access.
    struct TestServer {
        // Bound address (127.0.0.1 with an OS-assigned port).
        addr: SocketAddr,
        // Taken in `Drop` to tell the accept loop to exit.
        shutdown: Option<std::sync::mpsc::Sender<()>>,
        // Accept-loop thread handle; joined on drop.
        handle: Option<thread::JoinHandle<()>>,
    }
    impl TestServer {
        /// Bind 127.0.0.1 on an ephemeral port and serve requests on a
        /// background thread until the shutdown channel fires.
        ///
        /// `handler` receives only the HTTP request line (e.g. "GET /p HTTP/1.1")
        /// and returns the response to write back.
        fn spawn(handler: Arc<dyn Fn(&str) -> HttpResponse + Send + Sync + 'static>) -> Self {
            let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
            // Non-blocking accept lets the loop poll the shutdown channel
            // between connection attempts instead of blocking forever.
            listener
                .set_nonblocking(true)
                .expect("set nonblocking listener");
            let addr = listener.local_addr().expect("local addr");
            let (tx, rx) = std::sync::mpsc::channel::<()>();
            let handle = thread::spawn(move || loop {
                if rx.try_recv().is_ok() {
                    break;
                }
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        // NOTE(review): assumes the full request arrives in a
                        // single read of up to 4096 bytes — fine for the small
                        // local requests these tests issue.
                        let mut buffer = [0_u8; 4096];
                        let size = stream.read(&mut buffer).expect("read request");
                        let request = String::from_utf8_lossy(&buffer[..size]).into_owned();
                        // Only the request line is handed to the handler.
                        let request_line = request.lines().next().unwrap_or_default().to_string();
                        let response = handler(&request_line);
                        stream
                            .write_all(response.to_bytes().as_slice())
                            .expect("write response");
                    }
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        // No pending connection; back off briefly before polling again.
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("server accept failed: {error}"),
                }
            });
            Self {
                addr,
                shutdown: Some(tx),
                handle: Some(handle),
            }
        }
        /// Address clients should dial, e.g. `http://{addr}/path`.
        fn addr(&self) -> SocketAddr {
            self.addr
        }
    }
impl Drop for TestServer {
fn drop(&mut self) {
if let Some(tx) = self.shutdown.take() {
let _ = tx.send(());
}
if let Some(handle) = self.handle.take() {
handle.join().expect("join test server");
}
}
}
struct HttpResponse {
status: u16,
reason: &'static str,
content_type: &'static str,
body: String,
}
impl HttpResponse {
fn html(status: u16, reason: &'static str, body: &str) -> Self {
Self {
status,
reason,
content_type: "text/html; charset=utf-8",
body: body.to_string(),
}
}
fn to_bytes(&self) -> Vec<u8> {
format!(
"HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
self.status,
self.reason,
self.content_type,
self.body.len(),
self.body
)
.into_bytes()
}
}
}