about summary refs log tree commit diff
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2023-01-11T15·14+0300
committerclbot <clbot@tvl.fyi>2023-01-11T16·03+0000
commit3045645df07ffdb54f9d2a11ee2e41e31999986f (patch)
treeee768e7a42e4addd6c5f9591163b8b8a683d68ca
parent9382afdb0d7258cee8e7f20d646a85076f38011a (diff)
feat(tvix/cli): implement initial refscan module r/5643
This module implements a ReferenceScanner struct which uses the
aho_corasick crate to scan string inputs for known, non-overlapping
candidates (store paths, in our case).

I experimented with several different APIs, and landed on this version
with an initial accumulator in the scanner. The scanner is
instantiated from the candidates and "fed" all the strings, then
consumed by the caller to retrieve the result.

Right now only things that look vaguely like bytestrings can be fed to
the scanner, there is no streaming support in the API yet.

Change-Id: I7782f0f0df5fc64bccd813aa14712f5525b0168c
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7808
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
-rw-r--r--tvix/Cargo.lock1
-rw-r--r--tvix/Cargo.nix4
-rw-r--r--tvix/cli/Cargo.toml1
-rw-r--r--tvix/cli/src/main.rs1
-rw-r--r--tvix/cli/src/refscan.rs97
5 files changed, 104 insertions, 0 deletions
diff --git a/tvix/Cargo.lock b/tvix/Cargo.lock
index 197bee75e6..a5ea4b82e8 100644
--- a/tvix/Cargo.lock
+++ b/tvix/Cargo.lock
@@ -2177,6 +2177,7 @@ checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
 name = "tvix-cli"
 version = "0.1.0"
 dependencies = [
+ "aho-corasick",
  "clap 4.0.32",
  "dirs",
  "rustyline",
diff --git a/tvix/Cargo.nix b/tvix/Cargo.nix
index 9f13a16073..0f2b6bdd98 100644
--- a/tvix/Cargo.nix
+++ b/tvix/Cargo.nix
@@ -6478,6 +6478,10 @@ rec {
           else ./cli;
         dependencies = [
           {
+            name = "aho-corasick";
+            packageId = "aho-corasick";
+          }
+          {
             name = "clap";
             packageId = "clap 4.0.32";
             features = [ "derive" "env" ];
diff --git a/tvix/cli/Cargo.toml b/tvix/cli/Cargo.toml
index 099002353d..f3324f2611 100644
--- a/tvix/cli/Cargo.toml
+++ b/tvix/cli/Cargo.toml
@@ -13,3 +13,4 @@ rustyline = "10.0.0"
 clap = { version = "4.0", features = ["derive", "env"] }
 dirs = "4.0.0"
 smol_str = "0.1"
+aho-corasick = "0.7"
diff --git a/tvix/cli/src/main.rs b/tvix/cli/src/main.rs
index eec4d8bbb2..0f837b346c 100644
--- a/tvix/cli/src/main.rs
+++ b/tvix/cli/src/main.rs
@@ -1,4 +1,5 @@
 mod nix_compat;
+mod refscan;
 
 use std::{fs, path::PathBuf};
 
diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs
new file mode 100644
index 0000000000..76857142e8
--- /dev/null
+++ b/tvix/cli/src/refscan.rs
@@ -0,0 +1,97 @@
+//! Simple scanner for non-overlapping, known references of Nix store paths in a
+//! given string.
+//!
+//! This is used for determining build references (see
+//! //tvix/eval/docs/build-references.md for more details).
+//!
+//! The scanner itself is an Aho-Corasick automaton, using the `aho-corasick`
+//! crate.
+
+use aho_corasick::AhoCorasick;
+use std::collections::BTreeSet;
+
+/// Represents a "primed" reference scanner with an automaton that knows the set
+/// of store paths to scan for.
+pub struct ReferenceScanner<'c, 's> {
+    candidates: &'c [&'s str],
+    searcher: AhoCorasick,
+    matches: BTreeSet<&'s str>,
+}
+
+impl<'c, 's> ReferenceScanner<'c, 's> {
+    /// Construct a new `ReferenceScanner` that knows how to scan for the given
+    /// candidate store paths.
+    pub fn new(candidates: &'c [&'s str]) -> Self {
+        let searcher = AhoCorasick::new_auto_configured(candidates);
+
+        ReferenceScanner {
+            searcher,
+            candidates,
+            matches: Default::default(),
+        }
+    }
+
+    /// Scan the given string for all non-overlapping matches and collect them
+    /// in the scanner.
+    pub fn scan_str<H: AsRef<[u8]>>(&mut self, haystack: H) {
+        for m in self.searcher.find_iter(&haystack) {
+            let needle = self.candidates[m.pattern()];
+            self.matches.insert(needle);
+        }
+    }
+
+    /// Finalise the reference scanner and return the resulting matches.
+    pub fn finalise(self) -> BTreeSet<&'s str> {
+        self.matches
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // The actual derivation of `nixpkgs.hello`.
+    const HELLO_DRV: &'static str = r#"Derive([("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1","","")],[("/nix/store/6z1jfnqqgyqr221zgbpm30v91yfj3r45-bash-5.1-p16.drv",["out"]),("/nix/store/ap9g09fxbicj836zm88d56dn3ff4clxl-stdenv-linux.drv",["out"]),("/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",["out"])],["/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],"x86_64-linux","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash",["-e","/nix/store/9krlzvny65gdc8s7kpb6lkx8cd02c25b-default-builder.sh"],[("buildInputs",""),("builder","/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16/bin/bash"),("cmakeFlags",""),("configureFlags",""),("depsBuildBuild",""),("depsBuildBuildPropagated",""),("depsBuildTarget",""),("depsBuildTargetPropagated",""),("depsHostHost",""),("depsHostHostPropagated",""),("depsTargetTarget",""),("depsTargetTargetPropagated",""),("doCheck","1"),("doInstallCheck",""),("mesonFlags",""),("name","hello-2.12.1"),("nativeBuildInputs",""),("out","/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1"),("outputs","out"),("patches",""),("pname","hello"),("propagatedBuildInputs",""),("propagatedNativeBuildInputs",""),("src","/nix/store/pa10z4ngm0g83kx9mssrqzz30s84vq7k-hello-2.12.1.tar.gz"),("stdenv","/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux"),("strictDeps",""),("system","x86_64-linux"),("version","2.12.1")])"#;
+
+    #[test]
+    fn test_empty() {
+        let mut scanner = ReferenceScanner::new(&[]);
+        scanner.scan_str("hello world");
+        assert!(scanner.finalise().is_empty());
+    }
+
+    #[test]
+    fn test_single_match() {
+        let mut scanner =
+            ReferenceScanner::new(&["/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16"]);
+        scanner.scan_str(HELLO_DRV);
+
+        let result = scanner.finalise();
+
+        assert_eq!(result.len(), 1);
+        assert!(result.contains("/nix/store/4xw8n979xpivdc46a9ndcvyhwgif00hz-bash-5.1-p16"));
+    }
+
+    #[test]
+    fn test_multiple_matches() {
+        let candidates = &[
+            // these exist in the drv:
+            "/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1",
+            "/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",
+            "/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux",
+            // this doesn't:
+            "/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv",
+        ];
+
+        let mut scanner = ReferenceScanner::new(candidates);
+        scanner.scan_str(HELLO_DRV);
+
+        let result = scanner.finalise();
+
+        assert_eq!(result.len(), 3);
+
+        for c in candidates[..3].iter() {
+            assert!(result.contains(c));
+        }
+    }
+}