mirror of https://github.com/lWolvesl/claw-code.git
synced 2026-04-02 22:01:51 +08:00
feat(tools): add WebFetch and WebSearch parity primitives
Implement the first web-oriented Claude Code parity slice in the Rust tools crate. This adds concrete WebFetch and WebSearch tool specs, execution paths, lightweight HTML/search-result extraction, domain filtering, and local HTTP-backed tests while leaving the existing core file and shell tools intact.

Constraint: Keep the change scoped to tools-only Rust workspace code
Constraint: Match Claude Code tool names and JSON schemas closely enough for parity work
Rejected: Stub-only tool registrations | would not materially expand beyond MVP
Rejected: Full browser/search service integration | too large for this first logical slice
Confidence: medium
Scope-risk: moderate
Reversibility: clean
Directive: Treat these web helpers as a parity foundation; refine result quality without renaming the exposed tool contracts
Tested: cargo fmt; cargo test -p tools
Not-tested: cargo clippy; full workspace cargo test
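For orientation, the two new contracts are invoked through the same `execute_tool` entry point as the existing file and shell tools. A minimal sketch, assuming the `tools` crate exposes `execute_tool` at its root as the diff below suggests; the URL and query values are illustrative only:

use serde_json::json;

fn main() -> Result<(), String> {
    // WebFetch: the schema requires both `url` and `prompt`.
    let fetched = tools::execute_tool(
        "WebFetch",
        &json!({
            "url": "https://example.com",
            "prompt": "Summarize this page"
        }),
    )?;
    println!("{fetched}");

    // WebSearch: only `query` is required; the domain filters are optional.
    let searched = tools::execute_tool(
        "WebSearch",
        &json!({
            "query": "rust blocking http client",
            "allowed_domains": ["docs.rs"]
        }),
    )?;
    println!("{searched}");
    Ok(())
}

Both calls block on the network (reqwest's blocking client) and, on success, return the pretty-printed JSON of the WebFetchOutput / WebSearchOutput structs defined in the diff.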
rust/Cargo.lock (generated): 19 changed lines

@@ -212,6 +212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
 dependencies = [
  "futures-core",
+ "futures-sink",
 ]
 
 [[package]]
@@ -220,6 +221,18 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
 
+[[package]]
+name = "futures-io"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
+
+[[package]]
+name = "futures-sink"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
+
 [[package]]
 name = "futures-task"
 version = "0.3.32"
@@ -233,7 +246,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
 dependencies = [
  "futures-core",
+ "futures-io",
+ "futures-sink",
  "futures-task",
+ "memchr",
  "pin-project-lite",
  "slab",
 ]
@@ -898,7 +914,9 @@ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
 dependencies = [
  "base64",
  "bytes",
+ "futures-channel",
  "futures-core",
+ "futures-util",
  "http",
  "http-body",
  "http-body-util",
@@ -1352,6 +1370,7 @@ dependencies = [
 name = "tools"
 version = "0.1.0"
 dependencies = [
+ "reqwest",
  "runtime",
  "serde",
  "serde_json",
rust/tools/Cargo.toml:

@@ -7,6 +7,7 @@ publish.workspace = true
 
 [dependencies]
 runtime = { path = "../runtime" }
+reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 
rust/tools/src/lib.rs:

@@ -1,8 +1,12 @@
+use std::collections::BTreeSet;
+use std::time::{Duration, Instant};
+
+use reqwest::blocking::Client;
 use runtime::{
     edit_file, execute_bash, glob_search, grep_search, read_file, write_file, BashCommandInput,
     GrepSearchInput,
 };
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_json::{json, Value};
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -140,6 +144,40 @@ pub fn mvp_tool_specs() -> Vec<ToolSpec> {
                 "additionalProperties": false
             }),
         },
+        ToolSpec {
+            name: "WebFetch",
+            description:
+                "Fetch a URL, convert it into readable text, and answer a prompt about it.",
+            input_schema: json!({
+                "type": "object",
+                "properties": {
+                    "url": { "type": "string", "format": "uri" },
+                    "prompt": { "type": "string" }
+                },
+                "required": ["url", "prompt"],
+                "additionalProperties": false
+            }),
+        },
+        ToolSpec {
+            name: "WebSearch",
+            description: "Search the web for current information and return cited results.",
+            input_schema: json!({
+                "type": "object",
+                "properties": {
+                    "query": { "type": "string", "minLength": 2 },
+                    "allowed_domains": {
+                        "type": "array",
+                        "items": { "type": "string" }
+                    },
+                    "blocked_domains": {
+                        "type": "array",
+                        "items": { "type": "string" }
+                    }
+                },
+                "required": ["query"],
+                "additionalProperties": false
+            }),
+        },
     ]
 }
 
@@ -151,6 +189,8 @@ pub fn execute_tool(name: &str, input: &Value) -> Result<String, String> {
         "edit_file" => from_value::<EditFileInput>(input).and_then(run_edit_file),
         "glob_search" => from_value::<GlobSearchInputValue>(input).and_then(run_glob_search),
         "grep_search" => from_value::<GrepSearchInput>(input).and_then(run_grep_search),
+        "WebFetch" => from_value::<WebFetchInput>(input).and_then(run_web_fetch),
+        "WebSearch" => from_value::<WebSearchInput>(input).and_then(run_web_search),
        _ => Err(format!("unsupported tool: {name}")),
     }
 }
@@ -192,6 +232,14 @@ fn run_grep_search(input: GrepSearchInput) -> Result<String, String> {
     to_pretty_json(grep_search(&input).map_err(io_to_string)?)
 }
 
+fn run_web_fetch(input: WebFetchInput) -> Result<String, String> {
+    to_pretty_json(execute_web_fetch(&input)?)
+}
+
+fn run_web_search(input: WebSearchInput) -> Result<String, String> {
+    to_pretty_json(execute_web_search(&input)?)
+}
+
 fn to_pretty_json<T: serde::Serialize>(value: T) -> Result<String, String> {
     serde_json::to_string_pretty(&value).map_err(|error| error.to_string())
 }
@@ -227,8 +275,411 @@ struct GlobSearchInputValue {
     path: Option<String>,
 }
 
+#[derive(Debug, Deserialize)]
+struct WebFetchInput {
+    url: String,
+    prompt: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct WebSearchInput {
+    query: String,
+    allowed_domains: Option<Vec<String>>,
+    blocked_domains: Option<Vec<String>>,
+}
+
+#[derive(Debug, Serialize)]
+struct WebFetchOutput {
+    bytes: usize,
+    code: u16,
+    #[serde(rename = "codeText")]
+    code_text: String,
+    result: String,
+    #[serde(rename = "durationMs")]
+    duration_ms: u128,
+    url: String,
+}
+
+#[derive(Debug, Serialize)]
+struct WebSearchOutput {
+    query: String,
+    results: Vec<WebSearchResultItem>,
+    #[serde(rename = "durationSeconds")]
+    duration_seconds: f64,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(untagged)]
+enum WebSearchResultItem {
+    SearchResult {
+        tool_use_id: String,
+        content: Vec<SearchHit>,
+    },
+    Commentary(String),
+}
+
+#[derive(Debug, Serialize)]
+struct SearchHit {
+    title: String,
+    url: String,
+}
+
+fn execute_web_fetch(input: &WebFetchInput) -> Result<WebFetchOutput, String> {
+    let started = Instant::now();
+    let client = build_http_client()?;
+    let request_url = normalize_fetch_url(&input.url)?;
+    let response = client
+        .get(request_url.clone())
+        .send()
+        .map_err(|error| error.to_string())?;
+
+    let status = response.status();
+    let final_url = response.url().to_string();
+    let code = status.as_u16();
+    let code_text = status.canonical_reason().unwrap_or("Unknown").to_string();
+    let content_type = response
+        .headers()
+        .get(reqwest::header::CONTENT_TYPE)
+        .and_then(|value| value.to_str().ok())
+        .unwrap_or_default()
+        .to_string();
+    let body = response.text().map_err(|error| error.to_string())?;
+    let bytes = body.len();
+    let normalized = normalize_fetched_content(&body, &content_type);
+    let result = summarize_web_fetch(&final_url, &input.prompt, &normalized);
+
+    Ok(WebFetchOutput {
+        bytes,
+        code,
+        code_text,
+        result,
+        duration_ms: started.elapsed().as_millis(),
+        url: final_url,
+    })
+}
+
+fn execute_web_search(input: &WebSearchInput) -> Result<WebSearchOutput, String> {
+    let started = Instant::now();
+    let client = build_http_client()?;
+    let search_url = build_search_url(&input.query)?;
+    let response = client
+        .get(search_url)
+        .send()
+        .map_err(|error| error.to_string())?;
+
+    let final_url = response.url().clone();
+    let html = response.text().map_err(|error| error.to_string())?;
+    let mut hits = extract_search_hits(&html);
+
+    if hits.is_empty() && final_url.host_str().is_some() {
+        hits = extract_search_hits_from_generic_links(&html);
+    }
+
+    if let Some(allowed) = input.allowed_domains.as_ref() {
+        hits.retain(|hit| host_matches_list(&hit.url, allowed));
+    }
+    if let Some(blocked) = input.blocked_domains.as_ref() {
+        hits.retain(|hit| !host_matches_list(&hit.url, blocked));
+    }
+
+    dedupe_hits(&mut hits);
+    hits.truncate(8);
+
+    let summary = if hits.is_empty() {
+        format!("No web search results matched the query {:?}.", input.query)
+    } else {
+        let rendered_hits = hits
+            .iter()
+            .map(|hit| format!("- [{}]({})", hit.title, hit.url))
+            .collect::<Vec<_>>()
+            .join("\n");
+        format!(
+            "Search results for {:?}. Include a Sources section in the final answer.\n{}",
+            input.query, rendered_hits
+        )
+    };
+
+    Ok(WebSearchOutput {
+        query: input.query.clone(),
+        results: vec![
+            WebSearchResultItem::Commentary(summary),
+            WebSearchResultItem::SearchResult {
+                tool_use_id: String::from("web_search_1"),
+                content: hits,
+            },
+        ],
+        duration_seconds: started.elapsed().as_secs_f64(),
+    })
+}
+
+fn build_http_client() -> Result<Client, String> {
+    Client::builder()
+        .timeout(Duration::from_secs(20))
+        .redirect(reqwest::redirect::Policy::limited(10))
+        .user_agent("clawd-rust-tools/0.1")
+        .build()
+        .map_err(|error| error.to_string())
+}
+
+fn normalize_fetch_url(url: &str) -> Result<String, String> {
+    let parsed = reqwest::Url::parse(url).map_err(|error| error.to_string())?;
+    if parsed.scheme() == "http" {
+        let host = parsed.host_str().unwrap_or_default();
+        if host != "localhost" && host != "127.0.0.1" && host != "::1" {
+            let mut upgraded = parsed;
+            upgraded
+                .set_scheme("https")
+                .map_err(|_| String::from("failed to upgrade URL to https"))?;
+            return Ok(upgraded.to_string());
+        }
+    }
+    Ok(parsed.to_string())
+}
+
+fn build_search_url(query: &str) -> Result<reqwest::Url, String> {
+    if let Ok(base) = std::env::var("CLAWD_WEB_SEARCH_BASE_URL") {
+        let mut url = reqwest::Url::parse(&base).map_err(|error| error.to_string())?;
+        url.query_pairs_mut().append_pair("q", query);
+        return Ok(url);
+    }
+
+    let mut url = reqwest::Url::parse("https://html.duckduckgo.com/html/")
+        .map_err(|error| error.to_string())?;
+    url.query_pairs_mut().append_pair("q", query);
+    Ok(url)
+}
+
+fn normalize_fetched_content(body: &str, content_type: &str) -> String {
+    if content_type.contains("html") {
+        html_to_text(body)
+    } else {
+        body.trim().to_string()
+    }
+}
+
+fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String {
+    let lower_prompt = prompt.to_lowercase();
+    let compact = collapse_whitespace(content);
+
+    let detail = if lower_prompt.contains("title") {
+        extract_title(content)
+            .map(|title| format!("Title: {title}"))
+            .unwrap_or_else(|| preview_text(&compact, 600))
+    } else if lower_prompt.contains("summary") || lower_prompt.contains("summarize") {
+        preview_text(&compact, 900)
+    } else {
+        let preview = preview_text(&compact, 900);
+        format!("Prompt: {prompt}\nContent preview:\n{preview}")
+    };
+
+    format!("Fetched {url}\n{detail}")
+}
+
+fn extract_title(content: &str) -> Option<String> {
+    for line in content.lines() {
+        let trimmed = line.trim();
+        if !trimmed.is_empty() {
+            return Some(trimmed.to_string());
+        }
+    }
+    None
+}
+
+fn html_to_text(html: &str) -> String {
+    let mut text = String::with_capacity(html.len());
+    let mut in_tag = false;
+    let mut previous_was_space = false;
+
+    for ch in html.chars() {
+        match ch {
+            '<' => in_tag = true,
+            '>' => in_tag = false,
+            _ if in_tag => {}
+            '&' => {
+                text.push('&');
+                previous_was_space = false;
+            }
+            ch if ch.is_whitespace() => {
+                if !previous_was_space {
+                    text.push(' ');
+                    previous_was_space = true;
+                }
+            }
+            _ => {
+                text.push(ch);
+                previous_was_space = false;
+            }
+        }
+    }
+
+    collapse_whitespace(&decode_html_entities(&text))
+}
+
+// Entity names reconstructed from the decoded replacement values; the
+// apostrophe entity could equally be "&#x27;" in the original source.
+fn decode_html_entities(input: &str) -> String {
+    input
+        .replace("&amp;", "&")
+        .replace("&lt;", "<")
+        .replace("&gt;", ">")
+        .replace("&quot;", "\"")
+        .replace("&#39;", "'")
+        .replace("&nbsp;", " ")
+}
+
+fn collapse_whitespace(input: &str) -> String {
+    input.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+
+fn preview_text(input: &str, max_chars: usize) -> String {
+    if input.chars().count() <= max_chars {
+        return input.to_string();
+    }
+    let shortened = input.chars().take(max_chars).collect::<String>();
+    format!("{}…", shortened.trim_end())
+}
+
+fn extract_search_hits(html: &str) -> Vec<SearchHit> {
+    let mut hits = Vec::new();
+    let mut remaining = html;
+
+    while let Some(anchor_start) = remaining.find("result__a") {
+        let after_class = &remaining[anchor_start..];
+        let Some(href_idx) = after_class.find("href=") else {
+            remaining = &after_class[1..];
+            continue;
+        };
+        let href_slice = &after_class[href_idx + 5..];
+        let Some((url, rest)) = extract_quoted_value(href_slice) else {
+            remaining = &after_class[1..];
+            continue;
+        };
+        let Some(close_tag_idx) = rest.find('>') else {
+            remaining = &after_class[1..];
+            continue;
+        };
+        let after_tag = &rest[close_tag_idx + 1..];
+        let Some(end_anchor_idx) = after_tag.find("</a>") else {
+            remaining = &after_tag[1..];
+            continue;
+        };
+        let title = html_to_text(&after_tag[..end_anchor_idx]);
+        if let Some(decoded_url) = decode_duckduckgo_redirect(&url) {
+            hits.push(SearchHit {
+                title: title.trim().to_string(),
+                url: decoded_url,
+            });
+        }
+        remaining = &after_tag[end_anchor_idx + 4..];
+    }
+
+    hits
+}
+
+fn extract_search_hits_from_generic_links(html: &str) -> Vec<SearchHit> {
+    let mut hits = Vec::new();
+    let mut remaining = html;
+
+    while let Some(anchor_start) = remaining.find("<a") {
+        let after_anchor = &remaining[anchor_start..];
+        let Some(href_idx) = after_anchor.find("href=") else {
+            remaining = &after_anchor[2..];
+            continue;
+        };
+        let href_slice = &after_anchor[href_idx + 5..];
+        let Some((url, rest)) = extract_quoted_value(href_slice) else {
+            remaining = &after_anchor[2..];
+            continue;
+        };
+        let Some(close_tag_idx) = rest.find('>') else {
+            remaining = &after_anchor[2..];
+            continue;
+        };
+        let after_tag = &rest[close_tag_idx + 1..];
+        let Some(end_anchor_idx) = after_tag.find("</a>") else {
+            remaining = &after_anchor[2..];
+            continue;
+        };
+        let title = html_to_text(&after_tag[..end_anchor_idx]);
+        if title.trim().is_empty() {
+            remaining = &after_tag[end_anchor_idx + 4..];
+            continue;
+        }
+        let decoded_url = decode_duckduckgo_redirect(&url).unwrap_or(url);
+        if decoded_url.starts_with("http://") || decoded_url.starts_with("https://") {
+            hits.push(SearchHit {
+                title: title.trim().to_string(),
+                url: decoded_url,
+            });
+        }
+        remaining = &after_tag[end_anchor_idx + 4..];
+    }
+
+    hits
+}
+
+fn extract_quoted_value(input: &str) -> Option<(String, &str)> {
+    let quote = input.chars().next()?;
+    if quote != '"' && quote != '\'' {
+        return None;
+    }
+    let rest = &input[quote.len_utf8()..];
+    let end = rest.find(quote)?;
+    Some((rest[..end].to_string(), &rest[end + quote.len_utf8()..]))
+}
+
+fn decode_duckduckgo_redirect(url: &str) -> Option<String> {
+    if url.starts_with("http://") || url.starts_with("https://") {
+        return Some(html_entity_decode_url(url));
+    }
+
+    let joined = if url.starts_with("//") {
+        format!("https:{url}")
+    } else if url.starts_with('/') {
+        format!("https://duckduckgo.com{url}")
+    } else {
+        return None;
+    };
+
+    let parsed = reqwest::Url::parse(&joined).ok()?;
+    if parsed.path() == "/l/" || parsed.path() == "/l" {
+        for (key, value) in parsed.query_pairs() {
+            if key == "uddg" {
+                return Some(html_entity_decode_url(value.as_ref()));
+            }
+        }
+    }
+    Some(joined)
+}
+
+fn html_entity_decode_url(url: &str) -> String {
+    decode_html_entities(url)
+}
+
+fn host_matches_list(url: &str, domains: &[String]) -> bool {
+    let Ok(parsed) = reqwest::Url::parse(url) else {
+        return false;
+    };
+    let Some(host) = parsed.host_str() else {
+        return false;
+    };
+    domains.iter().any(|domain| {
+        let normalized = domain.trim().trim_start_matches('.');
+        host == normalized || host.ends_with(&format!(".{normalized}"))
+    })
+}
+
+fn dedupe_hits(hits: &mut Vec<SearchHit>) {
+    let mut seen = BTreeSet::new();
+    hits.retain(|hit| seen.insert(hit.url.clone()));
+}
+
 #[cfg(test)]
 mod tests {
+    use std::io::{Read, Write};
+    use std::net::{SocketAddr, TcpListener};
+    use std::sync::Arc;
+    use std::thread;
+    use std::time::Duration;
+
     use super::{execute_tool, mvp_tool_specs};
     use serde_json::json;
 
@@ -240,6 +691,8 @@ mod tests {
             .collect::<Vec<_>>();
         assert!(names.contains(&"bash"));
         assert!(names.contains(&"read_file"));
+        assert!(names.contains(&"WebFetch"));
+        assert!(names.contains(&"WebSearch"));
     }
 
     #[test]
@@ -247,4 +700,167 @@ mod tests {
         let error = execute_tool("nope", &json!({})).expect_err("tool should be rejected");
         assert!(error.contains("unsupported tool"));
     }
+
+    #[test]
+    fn web_fetch_returns_prompt_aware_summary() {
+        let server = TestServer::spawn(Arc::new(|request_line: &str| {
+            assert!(request_line.starts_with("GET /page "));
+            HttpResponse::html(
+                200,
+                "OK",
+                "<html><head><title>Ignored</title></head><body><h1>Test Page</h1><p>Hello <b>world</b> from local server.</p></body></html>",
+            )
+        }));
+
+        let result = execute_tool(
+            "WebFetch",
+            &json!({
+                "url": format!("http://{}/page", server.addr()),
+                "prompt": "Summarize this page"
+            }),
+        )
+        .expect("WebFetch should succeed");
+
+        let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
+        assert_eq!(output["code"], 200);
+        let summary = output["result"].as_str().expect("result string");
+        assert!(summary.contains("Fetched"));
+        assert!(summary.contains("Test Page"));
+        assert!(summary.contains("Hello world from local server"));
+    }
+
+    #[test]
+    fn web_search_extracts_and_filters_results() {
+        let server = TestServer::spawn(Arc::new(|request_line: &str| {
+            assert!(request_line.contains("GET /search?q=rust+web+search "));
+            HttpResponse::html(
+                200,
+                "OK",
+                r#"
+                <html><body>
+                <a class="result__a" href="https://docs.rs/reqwest">Reqwest docs</a>
+                <a class="result__a" href="https://example.com/blocked">Blocked result</a>
+                </body></html>
+                "#,
+            )
+        }));
+
+        std::env::set_var(
+            "CLAWD_WEB_SEARCH_BASE_URL",
+            format!("http://{}/search", server.addr()),
+        );
+        let result = execute_tool(
+            "WebSearch",
+            &json!({
+                "query": "rust web search",
+                "allowed_domains": ["docs.rs"],
+                "blocked_domains": ["example.com"]
+            }),
+        )
+        .expect("WebSearch should succeed");
+        std::env::remove_var("CLAWD_WEB_SEARCH_BASE_URL");
+
+        let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
+        assert_eq!(output["query"], "rust web search");
+        let results = output["results"].as_array().expect("results array");
+        let search_result = results
+            .iter()
+            .find(|item| item.get("content").is_some())
+            .expect("search result block present");
+        let content = search_result["content"].as_array().expect("content array");
+        assert_eq!(content.len(), 1);
+        assert_eq!(content[0]["title"], "Reqwest docs");
+        assert_eq!(content[0]["url"], "https://docs.rs/reqwest");
+    }
+
+    struct TestServer {
+        addr: SocketAddr,
+        shutdown: Option<std::sync::mpsc::Sender<()>>,
+        handle: Option<thread::JoinHandle<()>>,
+    }
+
+    impl TestServer {
+        fn spawn(handler: Arc<dyn Fn(&str) -> HttpResponse + Send + Sync + 'static>) -> Self {
+            let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
+            listener
+                .set_nonblocking(true)
+                .expect("set nonblocking listener");
+            let addr = listener.local_addr().expect("local addr");
+            let (tx, rx) = std::sync::mpsc::channel::<()>();
+
+            let handle = thread::spawn(move || loop {
+                if rx.try_recv().is_ok() {
+                    break;
+                }
+
+                match listener.accept() {
+                    Ok((mut stream, _)) => {
+                        let mut buffer = [0_u8; 4096];
+                        let size = stream.read(&mut buffer).expect("read request");
+                        let request = String::from_utf8_lossy(&buffer[..size]).into_owned();
+                        let request_line = request.lines().next().unwrap_or_default().to_string();
+                        let response = handler(&request_line);
+                        stream
+                            .write_all(response.to_bytes().as_slice())
+                            .expect("write response");
+                    }
+                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
+                        thread::sleep(Duration::from_millis(10));
+                    }
+                    Err(error) => panic!("server accept failed: {error}"),
+                }
+            });
+
+            Self {
+                addr,
+                shutdown: Some(tx),
+                handle: Some(handle),
+            }
+        }
+
+        fn addr(&self) -> SocketAddr {
+            self.addr
+        }
+    }
+
+    impl Drop for TestServer {
+        fn drop(&mut self) {
+            if let Some(tx) = self.shutdown.take() {
+                let _ = tx.send(());
+            }
+            if let Some(handle) = self.handle.take() {
+                handle.join().expect("join test server");
+            }
+        }
+    }
+
+    struct HttpResponse {
+        status: u16,
+        reason: &'static str,
+        content_type: &'static str,
+        body: String,
+    }
+
+    impl HttpResponse {
+        fn html(status: u16, reason: &'static str, body: &str) -> Self {
+            Self {
+                status,
+                reason,
+                content_type: "text/html; charset=utf-8",
+                body: body.to_string(),
+            }
+        }
+
+        fn to_bytes(&self) -> Vec<u8> {
+            format!(
+                "HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
+                self.status,
+                self.reason,
+                self.content_type,
+                self.body.len(),
+                self.body
+            )
+            .into_bytes()
+        }
+    }
 }
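One note on the filtering semantics above: `host_matches_list` treats each configured domain as an exact host match or a dot-separated suffix match, with any leading dot trimmed. A minimal standalone sketch of just that rule; `domain_matches` is a hypothetical free-function rename for illustration, with the body copied from the closure in `host_matches_list`:

fn domain_matches(host: &str, domain: &str) -> bool {
    let normalized = domain.trim().trim_start_matches('.');
    host == normalized || host.ends_with(&format!(".{normalized}"))
}

fn main() {
    assert!(domain_matches("docs.rs", "docs.rs"));        // exact host
    assert!(domain_matches("crates.docs.rs", "docs.rs")); // subdomain via dot suffix
    assert!(domain_matches("docs.rs", ".docs.rs"));       // leading dot is trimmed
    assert!(!domain_matches("notdocs.rs", "docs.rs"));    // partial labels do not match
}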