supergit: implementing tree parsing

This implementation is a bit weird, especially because it changes the
API from what it was previously.  This works, for now, but some of the
relationships between types feel a bit bad.  Especially that all
queries have to go via the FileTree, and we can't just give out
objects that represent some part of the tree that are then loaded when
needed.

For now this will work though.  What's still missing is to turn a
Yield::Dir into a new FileTree.
wip/yesman
Katharina Fey 4 years ago committed by Mx Kookie
parent 2a180cc038
commit 948a4a76c1
  1. 3
      apps/servers/octopus/supergit/src/bin/test.rs
  2. 50
      apps/servers/octopus/supergit/src/branch.rs
  3. 6
      apps/servers/octopus/supergit/src/commit.rs
  4. 177
      apps/servers/octopus/supergit/src/files.rs
  5. 8
      apps/servers/octopus/supergit/src/lib.rs
  6. 32
      apps/servers/octopus/supergit/src/repo.rs

@ -22,5 +22,6 @@ fn main() {
let head = main.get_head(); let head = main.get_head();
let tree = head.get_tree(); let tree = head.get_tree();
println!("{:?}", tree.load(""));
} }

@ -4,7 +4,26 @@ use std::{mem, sync::Arc};
/// Abstraction for a branch history slice /// Abstraction for a branch history slice
/// ///
/// Git implements an acyclic graph, where branches can split and
/// be re-merged later. Traversal always happens from some point
/// onwards, backwards through the history. Because git repositories
/// can get quite large and this is a recursive process, it's very
/// quickly possible to overflow your program stack. To avoid this,
/// `supergit` uses an iterator design to enumerate commits.
/// ///
/// Use the API on this type to specify your starting point. By
/// default, it will be the head of the branch you are looking at.
/// Note: not all branches have names!
///
/// After creating a `BranchIter` you can then call `next()` on it,
/// yielding `BranchCommit` objects. These can either be single
/// commits, or various types of merge commits. Each merge commit
/// yields some set of `Branch` handles, which you can either
/// traverse by building another `BranchIter`, or ignore.
///
/// A branch iterator is therefore always first-parent, meaning that
/// merged branches can simply be ignored by only ever inspecting the
/// current `Commit` contained by a `BranchCommit`.
#[derive(Clone)] #[derive(Clone)]
pub struct Branch { pub struct Branch {
repo: Arc<Repository>, repo: Arc<Repository>,
@ -56,6 +75,7 @@ impl Branch {
} }
} }
/// Create a branch iterator that stops when reaching a commit
pub fn get_to(&self, commit: HashId) -> BranchIter { pub fn get_to(&self, commit: HashId) -> BranchIter {
BranchIter::new( BranchIter::new(
Arc::clone(&self.repo), Arc::clone(&self.repo),
@ -64,12 +84,10 @@ impl Branch {
) )
} }
/// Get the primary branch history as far back as it goes /// Create a step-limited branch iterator
pub fn get_all(&self) -> BranchIter { ///
BranchIter::new(Arc::clone(&self.repo), self.head.clone(), SegLimit::None) /// This type of iterator is especially useful when combined with
} /// `skip()`, to create a paginated view onto commits.
/// Get a branch segment of a certain length
pub fn get(&self, num: usize) -> BranchIter { pub fn get(&self, num: usize) -> BranchIter {
BranchIter::new( BranchIter::new(
Arc::clone(&self.repo), Arc::clone(&self.repo),
@ -78,7 +96,17 @@ impl Branch {
) )
} }
/// Get the commit pointed at by HEAD /// Create an endless branch iterator
///
/// While the creation of the iterator is instantaneous, actually
/// enumerating all commits in a repository can be quite
/// computationally intensive and is almost never what you
/// actually want.
pub fn get_all(&self) -> BranchIter {
    // `SegLimit::None` puts neither a step count nor a stop commit on
    // the iterator, unlike the sibling constructors on this type.
    let repo = Arc::clone(&self.repo);
    let start = self.head.clone();
    BranchIter::new(repo, start, SegLimit::None)
}
/// Get the current HEAD commit
pub fn get_head(&self) -> Commit { pub fn get_head(&self) -> Commit {
Commit::new(&self.repo, self.head.clone()).unwrap() Commit::new(&self.repo, self.head.clone()).unwrap()
} }
@ -89,10 +117,12 @@ impl Branch {
} }
} }
/// A branch segment iterator /// A branch slice iterator, created via `Branch` handle
/// ///
/// Each iterator is first-parent, but will notify you about a split /// This iterator yields `BranchCommit` objects, that can either be
/// parent by setting /// simple commits, or various types of merge commits with new Branch
/// handles. This means that without explicitly branching, this
/// iterator is first-parent.
pub struct BranchIter { pub struct BranchIter {
repo: Arc<Repository>, repo: Arc<Repository>,
curr: Option<HashId>, curr: Option<HashId>,

@ -26,7 +26,7 @@ impl Commit {
self.id.to_string() self.id.to_string()
} }
/// Get the summary line as a utf-7 string /// Get the summary line as a utf-8 string
pub fn summary(&self) -> String { pub fn summary(&self) -> String {
self.find().summary().unwrap().into() self.find().summary().unwrap().into()
} }
@ -55,6 +55,10 @@ impl Commit {
.and_then(|c| Self::new(&self.repo, c.id().into())) .and_then(|c| Self::new(&self.repo, c.id().into()))
} }
/// Get the set of parents as a vector
///
/// Use this function if you suspect a commit has more than one
/// parent.
pub fn parents(&self) -> Vec<Commit> { pub fn parents(&self) -> Vec<Commit> {
self.find() self.find()
.parents() .parents()

@ -1,13 +1,13 @@
use crate::{Branch, BranchIter, Commit, HashId}; use crate::{Branch, BranchIter, Commit, HashId};
use git2::{ObjectType, TreeWalkMode, TreeWalkResult};
use atomptr::AtomPtr; use atomptr::AtomPtr;
use git2::{ObjectType, TreeWalkMode, TreeWalkResult};
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::{path::PathBuf, sync::Arc}; use std::{path::PathBuf, sync::Arc};
/// A tree of files /// A tree of files
pub struct FileTree { pub struct FileTree {
repo: Arc<git2::Repository>, repo: Arc<git2::Repository>,
tree: AtomPtr<BTreeMap<String, TreeEntry>>, tree: AtomPtr<BTreeMap<String, Arc<TreeEntry>>>,
} }
impl FileTree { impl FileTree {
@ -23,39 +23,78 @@ impl FileTree {
/// Parse a tree from a specific commit /// Parse a tree from a specific commit
pub(crate) fn parse(self: Arc<Self>, commit: HashId) -> Arc<Self> { pub(crate) fn parse(self: Arc<Self>, commit: HashId) -> Arc<Self> {
let mut new_tree = BTreeMap::new(); let mut new_tree = BTreeMap::new();
let tree = (&self.repo) let tree = (&self.repo)
.find_commit(commit.to_oid()) .find_commit(commit.to_oid())
.unwrap() .unwrap()
.tree() .tree()
.unwrap(); .unwrap();
tree.walk(TreeWalkMode::PreOrder, |what, entry| { tree.walk(TreeWalkMode::PreOrder, |p, entry| {
let path_segs: Vec<_> = what.split("/").filter(|s| s != &"").collect(); let path_segs: Vec<_> = p.split("/").filter(|s| s != &"").collect();
let path = if path_segs.len() == 0 { let path = if path_segs.len() == 0 {
None None
} else { } else {
Some(path_segs) Some(path_segs)
}; };
println!("{:?} {}", path, entry.name().unwrap()); let te = TreeEntry::generate(path, entry);
new_tree.insert(te.path(), Arc::new(te));
TreeWalkResult::Ok TreeWalkResult::Ok
}) })
.unwrap(); .unwrap();
// Add a special entry for the root of the repo
new_tree.insert(
"".into(),
Arc::new(TreeEntry::Dir(Directory {
id: tree.id().into(),
path: "".into(),
name: "".into(),
})),
);
// This is needed to make borrowchk shut up
drop(tree); drop(tree);
// Atomicly swap new tree into place // Atomicly swap new tree into place
self.tree.swap(new_tree); self.tree.swap(new_tree);
self self
} }
/// Look up the entry indexed under `path` in the current tree map
fn get_entry(&self, path: &str) -> Option<Arc<TreeEntry>> {
    let index = self.tree.get_ref();
    // Cloning hands out another handle to the shared entry, not a copy
    index.get(path).cloned()
}
/// Load the entry stored at `path` in this `FileTree`
///
/// `path` is the repo-internal path of the entry; the empty string
/// addresses the root of the tree.  A file is read into memory and
/// returned as [`Yield::File`], while a directory yields the paths
/// of its direct children as [`Yield::Dir`].
///
/// Returns `None` if no entry is indexed under `path`, or if the
/// underlying git object can't be found in the repository.
pub fn load(&self, path: &str) -> Option<Yield> {
    let entry = self.get_entry(path)?;
    entry.load(&self.repo)
}
} }
/// An entry in a file tree /// Data yielded from loading a part of the file tree
///
/// This type is returned when fetching a path via `FileTree::load()`,
/// and can either be a single file read into memory, or an
/// enumeration of direct children of a directory.
/// ///
/// It's variants can either be a file (leaf), or a subtree, with it's /// To get all children of a subtree, use `Yield::into_tree()` to
/// own path handles, and children. /// create a new, recursive `FileTree` to enumerate.
pub enum TreeEntry { #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum Yield {
/// The contents of a single file, read into a buffer
File(Vec<u8>),
/// The repo-internal paths of the direct children of a directory
Dir(Vec<String>),
}
enum TreeEntry {
/// A single file /// A single file
File(File), File(File),
/// A sub-tree /// A sub-tree
@ -63,74 +102,114 @@ pub enum TreeEntry {
} }
impl TreeEntry { impl TreeEntry {
/// Create a tree entry from a path and `git2::TreeEntry` fn generate(path_segments: Option<Vec<&str>>, entry: &git2::TreeEntry) -> Self {
fn generate(root: PathBuf, path_segments: Option<Vec<String>>, entry: git2::TreeEntry) -> Self {
let path = path_segments.map_or("".into(), |p| path_segs_join(p)); let path = path_segments.map_or("".into(), |p| path_segs_join(p));
let id = entry.id().into();
let name = entry.name().unwrap().into();
match entry.kind() { match entry.kind() {
Some(ObjectType::Blob) => Self::File(File::new(root, path)), Some(ObjectType::Blob) => Self::File(File::new(id, path, name)),
Some(ObjectType::Tree) => Self::Dir(Directory::new(root, path)), Some(ObjectType::Tree) => Self::Dir(Directory::new(id, path, name)),
_ => unimplemented!(), _ => unimplemented!(),
} }
} }
/// Load this tree entry from disk, if it is a file fn load(&self, repo: &Arc<git2::Repository>) -> Option<Yield> {
/// let id = self.id();
/// When calling this function on a directory, nothing will
/// happen, because directories can't be loaded. If you want to
/// get a list of children for a directory, use
/// [`FileTree::enumerate()`]() instead!
pub fn load(&self) -> Option<Vec<u8>> {
if !self.is_file() {
return None;
}
let obj = match self {
Self::File(ref f) => repo
.find_blob(id.into())
.ok()
.map(|b| Yield::File(b.content().into())),
Self::Dir(ref d) => repo
.find_tree(id.into())
.ok()
.map(|tree| {
let mut children = vec![];
// Iterate the tree, but only as long as there are no
// additional path segments
tree.walk(TreeWalkMode::PreOrder, |p, entry| {
let path_segs: Vec<_> = p.split("/").filter(|s| s != &"").collect();
if path_segs.len() > 0 {
TreeWalkResult::Skip
} else {
// Take the current tree path, and append the
// name of the entry we're currently visiting
let path = PathBuf::new().join(self.path()).join(entry.name().unwrap());
children.push(path.as_path().to_str().unwrap().into());
TreeWalkResult::Ok
}
});
children
})
.map(|c| Yield::Dir(c)),
}
} }
/// Check if this tree entry is a file fn is_file(&self) -> bool {
pub fn is_file(&self) -> bool {
match self { match self {
Self::File(_) => true, Self::File(_) => true,
Self::Dir(_) => false, Self::Dir(_) => false,
} }
} }
/// Get the git object id of this entry, whether file or directory
fn id(&self) -> HashId {
    let id = match self {
        Self::File(file) => &file.id,
        Self::Dir(dir) => &dir.id,
    };
    id.clone()
}
/// Get the repo-internal path (including name)
///
/// This is the key under which an entry is indexed in a file tree,
/// so that deeply nested items can be fetched with a single map
/// lookup (logarithmic in the number of entries, as the index is a
/// `BTreeMap`) instead of walking the tree level by level.
fn path(&self) -> String {
    let (parent, name) = match self {
        Self::File(file) => (&file.path, &file.name),
        Self::Dir(dir) => (&dir.path, &dir.name),
    };
    // Both parts are `String`s, so the joined path is always valid
    // utf-8 and the `unwrap()` can't fail.
    PathBuf::new()
        .join(parent)
        .join(name)
        .to_str()
        .unwrap()
        .into()
}
} }
/// A file to have ever existed in a git repo struct File {
pub struct File { id: HashId,
root: PathBuf,
path: String, path: String,
name: String,
} }
impl File { impl File {
pub(crate) fn new(root: PathBuf, path: String) -> Self { fn new(id: HashId, path: String, name: String) -> Self {
Self { root, path } Self { id, path, name }
}
/// Get the history of a file from a branch iterator
pub fn get_history(&self, branch: BranchIter) -> Vec<Commit> {
todo!()
} }
} }
/// A subdirectory in a file tree struct Directory {
/// id: HashId,
/// A directory has a set of children, which can either be Files, or
/// other directories. Many of the functions to retrieve metadata
/// (such as the last commit, count, etc) will be deferred to the
/// children of this directory.
pub struct Directory {
root: PathBuf,
path: String, path: String,
name: String,
} }
impl Directory { impl Directory {
pub(crate) fn new(root: PathBuf, path: String) -> Self { fn new(id: HashId, path: String, name: String) -> Self {
Self { root, path } Self { id, path, name }
}
fn enumerate(&self, repo: git2::Repository) -> Vec<String> {
vec![]
} }
} }
////////////////////////////////
/// Take a vector of path segments, and turn it into a valid offset path /// Take a vector of path segments, and turn it into a valid offset path
/// ///
/// There are tests to make sure this function works properly. /// There are tests to make sure this function works properly.
@ -139,7 +218,7 @@ impl Directory {
/// * vec![] -> "" /// * vec![] -> ""
/// * vec!["foo"] -> "foo" /// * vec!["foo"] -> "foo"
/// * vec!["foo", "bar", "baz"] -> "foo/bar/baz" /// * vec!["foo", "bar", "baz"] -> "foo/bar/baz"
fn path_segs_join(segments: Vec<String>) -> String { fn path_segs_join(segments: Vec<&str>) -> String {
segments segments
.into_iter() .into_iter()
.fold(PathBuf::new(), |buf, seg| buf.join(seg)) .fold(PathBuf::new(), |buf, seg| buf.join(seg))

@ -5,8 +5,10 @@
//! repository, consider using that library instead. //! repository, consider using that library instead.
//! //!
//! supergit aims to make queries into a git repo as typed and easy as //! supergit aims to make queries into a git repo as typed and easy as
//! possible. Start by creating a [`Repository`](), and enumerating //! possible. Start by creating a
//! or fetching [`Branch`]()es that you are interested in. //! [`Repository`](struct.Repository.html), and enumerating or
//! fetching [`Branch`](struct.Branch.html)es that you are interested
//! in.
//! //!
//! Unlike `libgit2`, this library can resolve reverse dependencies //! Unlike `libgit2`, this library can resolve reverse dependencies
//! between files, and their commit history. Some of these functions //! between files, and their commit history. Some of these functions
@ -27,7 +29,7 @@ pub(crate) use repo::HashId;
pub use repo::Repository; pub use repo::Repository;
mod files; mod files;
pub use files::{File, FileTree}; pub use files::{Yield, FileTree};
use async_std::sync::{Arc, RwLock}; use async_std::sync::{Arc, RwLock};
use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::atomic::{AtomicUsize, Ordering};

@ -2,7 +2,7 @@
use crate::{Branch, BranchCommit}; use crate::{Branch, BranchCommit};
use git2::{self, Oid}; use git2::{self, Oid};
use std::sync::Arc; use std::{fmt, sync::Arc};
pub type GitResult<T> = Result<T, GitError>; pub type GitResult<T> = Result<T, GitError>;
@ -10,6 +10,12 @@ pub type GitResult<T> = Result<T, GitError>;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct HashId(String); pub struct HashId(String);
impl fmt::Display for HashId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}
impl HashId { impl HashId {
pub fn to_oid(&self) -> Oid { pub fn to_oid(&self) -> Oid {
self.clone().into() self.clone().into()
@ -63,6 +69,7 @@ pub struct Repository {
} }
impl Repository { impl Repository {
/// Open a repository read-only at a specific path
pub fn open(path: &str) -> GitResult<Self> { pub fn open(path: &str) -> GitResult<Self> {
Ok(Self { Ok(Self {
inner: Arc::new(git2::Repository::open(path)?), inner: Arc::new(git2::Repository::open(path)?),
@ -71,9 +78,12 @@ impl Repository {
/// Parse branch data from repository /// Parse branch data from repository
/// ///
/// If you only care about a single branch, you can also use the
/// convenience function `get_branch()`.
///
/// ## Panics /// ## Panics
/// ///
/// If there is an error around getting the name, or head commit. /// This function can panic when branch metadata is missing.
pub fn branches(&self) -> GitResult<Vec<Branch>> { pub fn branches(&self) -> GitResult<Vec<Branch>> {
Ok(self Ok(self
.inner .inner
@ -88,11 +98,17 @@ impl Repository {
.collect()) .collect())
} }
/// Get the files touched by a commit /// Get a single branch by name
pub fn get_files_for(&self, id: HashId) -> GitResult<Vec<()>> { ///
let c = self.inner.find_commit(id.into())?; /// This function will enumerate all branches, and then select the
let tree = c.tree()?; /// desired one. If you want to make repeated queries onto the
/// branch set, it's recommended you call `branches()`, and cache
todo!() /// the data yourself.
pub fn get_branch(&self, name: String) -> Option<Branch> {
self.branches().ok().and_then(|ok| {
ok.into_iter()
.filter(|b| b.name().is_some())
.find(|b| &b.name().unwrap() == &name)
})
} }
} }

Loading…
Cancel
Save