diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 308a108..8e7d88d 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -212,6 +212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -220,6 +221,18 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + [[package]] name = "futures-task" version = "0.3.32" @@ -233,7 +246,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-core", + "futures-io", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "slab", ] @@ -898,7 +914,9 @@ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64", "bytes", + "futures-channel", "futures-core", + "futures-util", "http", "http-body", "http-body-util", @@ -1352,6 +1370,7 @@ dependencies = [ name = "tools" version = "0.1.0" dependencies = [ + "reqwest", "runtime", "serde", "serde_json", diff --git a/rust/crates/tools/Cargo.toml b/rust/crates/tools/Cargo.toml index e1fb5bb..64768f4 100644 --- a/rust/crates/tools/Cargo.toml +++ b/rust/crates/tools/Cargo.toml @@ -7,6 +7,7 @@ publish.workspace = true [dependencies] runtime = { path = "../runtime" } +reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] } serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/rust/crates/tools/src/lib.rs b/rust/crates/tools/src/lib.rs index d8806b8..e6ab4e7 100644 --- a/rust/crates/tools/src/lib.rs +++ b/rust/crates/tools/src/lib.rs @@ -1,8 +1,12 @@ +use std::collections::BTreeSet; +use std::time::{Duration, Instant}; + +use reqwest::blocking::Client; use runtime::{ edit_file, execute_bash, glob_search, grep_search, read_file, write_file, BashCommandInput, GrepSearchInput, }; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -140,6 +144,40 @@ pub fn mvp_tool_specs() -> Vec { "additionalProperties": false }), }, + ToolSpec { + name: "WebFetch", + description: + "Fetch a URL, convert it into readable text, and answer a prompt about it.", + input_schema: json!({ + "type": "object", + "properties": { + "url": { "type": "string", "format": "uri" }, + "prompt": { "type": "string" } + }, + "required": ["url", "prompt"], + "additionalProperties": false + }), + }, + ToolSpec { + name: "WebSearch", + description: "Search the web for current information and return cited results.", + input_schema: json!({ + "type": "object", + "properties": { + "query": { "type": "string", "minLength": 2 }, + "allowed_domains": { + "type": "array", + "items": { "type": "string" } + }, + "blocked_domains": { + "type": "array", + "items": { "type": "string" } + } + }, + "required": ["query"], + "additionalProperties": false + }), + }, ] } @@ -151,6 +189,8 @@ pub fn execute_tool(name: &str, input: &Value) -> Result { "edit_file" => from_value::(input).and_then(run_edit_file), "glob_search" => from_value::(input).and_then(run_glob_search), "grep_search" => from_value::(input).and_then(run_grep_search), + "WebFetch" => from_value::(input).and_then(run_web_fetch), + "WebSearch" => from_value::(input).and_then(run_web_search), _ => Err(format!("unsupported tool: {name}")), } } @@ -192,6 +232,14 @@ fn run_grep_search(input: GrepSearchInput) -> Result { to_pretty_json(grep_search(&input).map_err(io_to_string)?) } +fn run_web_fetch(input: WebFetchInput) -> Result { + to_pretty_json(execute_web_fetch(&input)?) +} + +fn run_web_search(input: WebSearchInput) -> Result { + to_pretty_json(execute_web_search(&input)?) +} + fn to_pretty_json(value: T) -> Result { serde_json::to_string_pretty(&value).map_err(|error| error.to_string()) } @@ -227,8 +275,411 @@ struct GlobSearchInputValue { path: Option, } +#[derive(Debug, Deserialize)] +struct WebFetchInput { + url: String, + prompt: String, +} + +#[derive(Debug, Deserialize)] +struct WebSearchInput { + query: String, + allowed_domains: Option>, + blocked_domains: Option>, +} + +#[derive(Debug, Serialize)] +struct WebFetchOutput { + bytes: usize, + code: u16, + #[serde(rename = "codeText")] + code_text: String, + result: String, + #[serde(rename = "durationMs")] + duration_ms: u128, + url: String, +} + +#[derive(Debug, Serialize)] +struct WebSearchOutput { + query: String, + results: Vec, + #[serde(rename = "durationSeconds")] + duration_seconds: f64, +} + +#[derive(Debug, Serialize)] +#[serde(untagged)] +enum WebSearchResultItem { + SearchResult { + tool_use_id: String, + content: Vec, + }, + Commentary(String), +} + +#[derive(Debug, Serialize)] +struct SearchHit { + title: String, + url: String, +} + +fn execute_web_fetch(input: &WebFetchInput) -> Result { + let started = Instant::now(); + let client = build_http_client()?; + let request_url = normalize_fetch_url(&input.url)?; + let response = client + .get(request_url.clone()) + .send() + .map_err(|error| error.to_string())?; + + let status = response.status(); + let final_url = response.url().to_string(); + let code = status.as_u16(); + let code_text = status.canonical_reason().unwrap_or("Unknown").to_string(); + let content_type = response + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .unwrap_or_default() + .to_string(); + let body = response.text().map_err(|error| error.to_string())?; + let bytes = body.len(); + let normalized = normalize_fetched_content(&body, &content_type); + let result = summarize_web_fetch(&final_url, &input.prompt, &normalized); + + Ok(WebFetchOutput { + bytes, + code, + code_text, + result, + duration_ms: started.elapsed().as_millis(), + url: final_url, + }) +} + +fn execute_web_search(input: &WebSearchInput) -> Result { + let started = Instant::now(); + let client = build_http_client()?; + let search_url = build_search_url(&input.query)?; + let response = client + .get(search_url) + .send() + .map_err(|error| error.to_string())?; + + let final_url = response.url().clone(); + let html = response.text().map_err(|error| error.to_string())?; + let mut hits = extract_search_hits(&html); + + if hits.is_empty() && final_url.host_str().is_some() { + hits = extract_search_hits_from_generic_links(&html); + } + + if let Some(allowed) = input.allowed_domains.as_ref() { + hits.retain(|hit| host_matches_list(&hit.url, allowed)); + } + if let Some(blocked) = input.blocked_domains.as_ref() { + hits.retain(|hit| !host_matches_list(&hit.url, blocked)); + } + + dedupe_hits(&mut hits); + hits.truncate(8); + + let summary = if hits.is_empty() { + format!("No web search results matched the query {:?}.", input.query) + } else { + let rendered_hits = hits + .iter() + .map(|hit| format!("- [{}]({})", hit.title, hit.url)) + .collect::>() + .join("\n"); + format!( + "Search results for {:?}. Include a Sources section in the final answer.\n{}", + input.query, rendered_hits + ) + }; + + Ok(WebSearchOutput { + query: input.query.clone(), + results: vec![ + WebSearchResultItem::Commentary(summary), + WebSearchResultItem::SearchResult { + tool_use_id: String::from("web_search_1"), + content: hits, + }, + ], + duration_seconds: started.elapsed().as_secs_f64(), + }) +} + +fn build_http_client() -> Result { + Client::builder() + .timeout(Duration::from_secs(20)) + .redirect(reqwest::redirect::Policy::limited(10)) + .user_agent("clawd-rust-tools/0.1") + .build() + .map_err(|error| error.to_string()) +} + +fn normalize_fetch_url(url: &str) -> Result { + let parsed = reqwest::Url::parse(url).map_err(|error| error.to_string())?; + if parsed.scheme() == "http" { + let host = parsed.host_str().unwrap_or_default(); + if host != "localhost" && host != "127.0.0.1" && host != "::1" { + let mut upgraded = parsed; + upgraded + .set_scheme("https") + .map_err(|_| String::from("failed to upgrade URL to https"))?; + return Ok(upgraded.to_string()); + } + } + Ok(parsed.to_string()) +} + +fn build_search_url(query: &str) -> Result { + if let Ok(base) = std::env::var("CLAWD_WEB_SEARCH_BASE_URL") { + let mut url = reqwest::Url::parse(&base).map_err(|error| error.to_string())?; + url.query_pairs_mut().append_pair("q", query); + return Ok(url); + } + + let mut url = reqwest::Url::parse("https://html.duckduckgo.com/html/") + .map_err(|error| error.to_string())?; + url.query_pairs_mut().append_pair("q", query); + Ok(url) +} + +fn normalize_fetched_content(body: &str, content_type: &str) -> String { + if content_type.contains("html") { + html_to_text(body) + } else { + body.trim().to_string() + } +} + +fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String { + let lower_prompt = prompt.to_lowercase(); + let compact = collapse_whitespace(content); + + let detail = if lower_prompt.contains("title") { + extract_title(content) + .map(|title| format!("Title: {title}")) + .unwrap_or_else(|| preview_text(&compact, 600)) + } else if lower_prompt.contains("summary") || lower_prompt.contains("summarize") { + preview_text(&compact, 900) + } else { + let preview = preview_text(&compact, 900); + format!("Prompt: {prompt}\nContent preview:\n{preview}") + }; + + format!("Fetched {url}\n{detail}") +} + +fn extract_title(content: &str) -> Option { + for line in content.lines() { + let trimmed = line.trim(); + if !trimmed.is_empty() { + return Some(trimmed.to_string()); + } + } + None +} + +fn html_to_text(html: &str) -> String { + let mut text = String::with_capacity(html.len()); + let mut in_tag = false; + let mut previous_was_space = false; + + for ch in html.chars() { + match ch { + '<' => in_tag = true, + '>' => in_tag = false, + _ if in_tag => {} + '&' => { + text.push('&'); + previous_was_space = false; + } + ch if ch.is_whitespace() => { + if !previous_was_space { + text.push(' '); + previous_was_space = true; + } + } + _ => { + text.push(ch); + previous_was_space = false; + } + } + } + + collapse_whitespace(&decode_html_entities(&text)) +} + +fn decode_html_entities(input: &str) -> String { + input + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") + .replace(" ", " ") +} + +fn collapse_whitespace(input: &str) -> String { + input.split_whitespace().collect::>().join(" ") +} + +fn preview_text(input: &str, max_chars: usize) -> String { + if input.chars().count() <= max_chars { + return input.to_string(); + } + let shortened = input.chars().take(max_chars).collect::(); + format!("{}…", shortened.trim_end()) +} + +fn extract_search_hits(html: &str) -> Vec { + let mut hits = Vec::new(); + let mut remaining = html; + + while let Some(anchor_start) = remaining.find("result__a") { + let after_class = &remaining[anchor_start..]; + let Some(href_idx) = after_class.find("href=") else { + remaining = &after_class[1..]; + continue; + }; + let href_slice = &after_class[href_idx + 5..]; + let Some((url, rest)) = extract_quoted_value(href_slice) else { + remaining = &after_class[1..]; + continue; + }; + let Some(close_tag_idx) = rest.find('>') else { + remaining = &after_class[1..]; + continue; + }; + let after_tag = &rest[close_tag_idx + 1..]; + let Some(end_anchor_idx) = after_tag.find("") else { + remaining = &after_tag[1..]; + continue; + }; + let title = html_to_text(&after_tag[..end_anchor_idx]); + if let Some(decoded_url) = decode_duckduckgo_redirect(&url) { + hits.push(SearchHit { + title: title.trim().to_string(), + url: decoded_url, + }); + } + remaining = &after_tag[end_anchor_idx + 4..]; + } + + hits +} + +fn extract_search_hits_from_generic_links(html: &str) -> Vec { + let mut hits = Vec::new(); + let mut remaining = html; + + while let Some(anchor_start) = remaining.find("') else { + remaining = &after_anchor[2..]; + continue; + }; + let after_tag = &rest[close_tag_idx + 1..]; + let Some(end_anchor_idx) = after_tag.find("") else { + remaining = &after_anchor[2..]; + continue; + }; + let title = html_to_text(&after_tag[..end_anchor_idx]); + if title.trim().is_empty() { + remaining = &after_tag[end_anchor_idx + 4..]; + continue; + } + let decoded_url = decode_duckduckgo_redirect(&url).unwrap_or(url); + if decoded_url.starts_with("http://") || decoded_url.starts_with("https://") { + hits.push(SearchHit { + title: title.trim().to_string(), + url: decoded_url, + }); + } + remaining = &after_tag[end_anchor_idx + 4..]; + } + + hits +} + +fn extract_quoted_value(input: &str) -> Option<(String, &str)> { + let quote = input.chars().next()?; + if quote != '"' && quote != '\'' { + return None; + } + let rest = &input[quote.len_utf8()..]; + let end = rest.find(quote)?; + Some((rest[..end].to_string(), &rest[end + quote.len_utf8()..])) +} + +fn decode_duckduckgo_redirect(url: &str) -> Option { + if url.starts_with("http://") || url.starts_with("https://") { + return Some(html_entity_decode_url(url)); + } + + let joined = if url.starts_with("//") { + format!("https:{url}") + } else if url.starts_with('/') { + format!("https://duckduckgo.com{url}") + } else { + return None; + }; + + let parsed = reqwest::Url::parse(&joined).ok()?; + if parsed.path() == "/l/" || parsed.path() == "/l" { + for (key, value) in parsed.query_pairs() { + if key == "uddg" { + return Some(html_entity_decode_url(value.as_ref())); + } + } + } + Some(joined) +} + +fn html_entity_decode_url(url: &str) -> String { + decode_html_entities(url) +} + +fn host_matches_list(url: &str, domains: &[String]) -> bool { + let Ok(parsed) = reqwest::Url::parse(url) else { + return false; + }; + let Some(host) = parsed.host_str() else { + return false; + }; + domains.iter().any(|domain| { + let normalized = domain.trim().trim_start_matches('.'); + host == normalized || host.ends_with(&format!(".{normalized}")) + }) +} + +fn dedupe_hits(hits: &mut Vec) { + let mut seen = BTreeSet::new(); + hits.retain(|hit| seen.insert(hit.url.clone())); +} + #[cfg(test)] mod tests { + use std::io::{Read, Write}; + use std::net::{SocketAddr, TcpListener}; + use std::sync::Arc; + use std::thread; + use std::time::Duration; + use super::{execute_tool, mvp_tool_specs}; use serde_json::json; @@ -240,6 +691,8 @@ mod tests { .collect::>(); assert!(names.contains(&"bash")); assert!(names.contains(&"read_file")); + assert!(names.contains(&"WebFetch")); + assert!(names.contains(&"WebSearch")); } #[test] @@ -247,4 +700,167 @@ mod tests { let error = execute_tool("nope", &json!({})).expect_err("tool should be rejected"); assert!(error.contains("unsupported tool")); } + + #[test] + fn web_fetch_returns_prompt_aware_summary() { + let server = TestServer::spawn(Arc::new(|request_line: &str| { + assert!(request_line.starts_with("GET /page ")); + HttpResponse::html( + 200, + "OK", + "Ignored

Test Page

Hello world from local server.

", + ) + })); + + let result = execute_tool( + "WebFetch", + &json!({ + "url": format!("http://{}/page", server.addr()), + "prompt": "Summarize this page" + }), + ) + .expect("WebFetch should succeed"); + + let output: serde_json::Value = serde_json::from_str(&result).expect("valid json"); + assert_eq!(output["code"], 200); + let summary = output["result"].as_str().expect("result string"); + assert!(summary.contains("Fetched")); + assert!(summary.contains("Test Page")); + assert!(summary.contains("Hello world from local server")); + } + + #[test] + fn web_search_extracts_and_filters_results() { + let server = TestServer::spawn(Arc::new(|request_line: &str| { + assert!(request_line.contains("GET /search?q=rust+web+search ")); + HttpResponse::html( + 200, + "OK", + r#" + + Reqwest docs + Blocked result + + "#, + ) + })); + + std::env::set_var( + "CLAWD_WEB_SEARCH_BASE_URL", + format!("http://{}/search", server.addr()), + ); + let result = execute_tool( + "WebSearch", + &json!({ + "query": "rust web search", + "allowed_domains": ["docs.rs"], + "blocked_domains": ["example.com"] + }), + ) + .expect("WebSearch should succeed"); + std::env::remove_var("CLAWD_WEB_SEARCH_BASE_URL"); + + let output: serde_json::Value = serde_json::from_str(&result).expect("valid json"); + assert_eq!(output["query"], "rust web search"); + let results = output["results"].as_array().expect("results array"); + let search_result = results + .iter() + .find(|item| item.get("content").is_some()) + .expect("search result block present"); + let content = search_result["content"].as_array().expect("content array"); + assert_eq!(content.len(), 1); + assert_eq!(content[0]["title"], "Reqwest docs"); + assert_eq!(content[0]["url"], "https://docs.rs/reqwest"); + } + + struct TestServer { + addr: SocketAddr, + shutdown: Option>, + handle: Option>, + } + + impl TestServer { + fn spawn(handler: Arc HttpResponse + Send + Sync + 'static>) -> Self { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + listener + .set_nonblocking(true) + .expect("set nonblocking listener"); + let addr = listener.local_addr().expect("local addr"); + let (tx, rx) = std::sync::mpsc::channel::<()>(); + + let handle = thread::spawn(move || loop { + if rx.try_recv().is_ok() { + break; + } + + match listener.accept() { + Ok((mut stream, _)) => { + let mut buffer = [0_u8; 4096]; + let size = stream.read(&mut buffer).expect("read request"); + let request = String::from_utf8_lossy(&buffer[..size]).into_owned(); + let request_line = request.lines().next().unwrap_or_default().to_string(); + let response = handler(&request_line); + stream + .write_all(response.to_bytes().as_slice()) + .expect("write response"); + } + Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_millis(10)); + } + Err(error) => panic!("server accept failed: {error}"), + } + }); + + Self { + addr, + shutdown: Some(tx), + handle: Some(handle), + } + } + + fn addr(&self) -> SocketAddr { + self.addr + } + } + + impl Drop for TestServer { + fn drop(&mut self) { + if let Some(tx) = self.shutdown.take() { + let _ = tx.send(()); + } + if let Some(handle) = self.handle.take() { + handle.join().expect("join test server"); + } + } + } + + struct HttpResponse { + status: u16, + reason: &'static str, + content_type: &'static str, + body: String, + } + + impl HttpResponse { + fn html(status: u16, reason: &'static str, body: &str) -> Self { + Self { + status, + reason, + content_type: "text/html; charset=utf-8", + body: body.to_string(), + } + } + + fn to_bytes(&self) -> Vec { + format!( + "HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}", + self.status, + self.reason, + self.content_type, + self.body.len(), + self.body + ) + .into_bytes() + } + } }