supergit: implementing tree parsing

This implementation is a bit weird, especially because it changes the
API from what it was previously.  This works, for now, but some of the
relationships between types feel a bit bad.  In particular, all
queries have to go via the FileTree, and we can't just give out
objects that represent some part of the tree and are only loaded
when needed.

For now this will work though.  What's still missing is to turn a
Yield::Dir into a new FileTree.
wip/yesman
Katharina Fey 4 years ago committed by Mx Kookie
parent 2a180cc038
commit 948a4a76c1
  1. 3
      apps/servers/octopus/supergit/src/bin/test.rs
  2. 50
      apps/servers/octopus/supergit/src/branch.rs
  3. 6
      apps/servers/octopus/supergit/src/commit.rs
  4. 177
      apps/servers/octopus/supergit/src/files.rs
  5. 8
      apps/servers/octopus/supergit/src/lib.rs
  6. 32
      apps/servers/octopus/supergit/src/repo.rs

@ -22,5 +22,6 @@ fn main() {
let head = main.get_head();
let tree = head.get_tree();
println!("{:?}", tree.load(""));
}

@ -4,7 +4,26 @@ use std::{mem, sync::Arc};
/// Abstraction for a branch history slice
///
/// Git implements an acyclic graph, where branches can split off
/// and be re-merged later. Traversal always happens from some point
/// onwards, backwards through the history. Because git repositories
/// can get quite large and this is a recursive process, it's very
/// quickly possible to overflow your program stack. To avoid this,
/// `supergit` uses an iterator design to enumerate commits.
///
/// Use the API on this type to specify your starting point. By
/// default, it will be the head of the branch you are looking at.
/// Note: not all branches have names!
///
/// After creating a `BranchIter` you can then call `next()` on it,
/// yielding `BranchCommit` objects. These can either be single
/// commits, or various types of merge commits. Each merge commit
/// yields some set of `Branch` handles, which you can traverse by
/// building another `BranchIter`, or simply ignore.
///
/// A branch iterator is therefore always first-parent, meaning that
/// merged branches can simply be ignored by only ever inspecting the
/// current `Commit` contained by a `BranchCommit`.
#[derive(Clone)]
pub struct Branch {
repo: Arc<Repository>,
@ -56,6 +75,7 @@ impl Branch {
}
}
/// Create a branch iterator that stops when reaching a commit
pub fn get_to(&self, commit: HashId) -> BranchIter {
BranchIter::new(
Arc::clone(&self.repo),
@ -64,12 +84,10 @@ impl Branch {
)
}
/// Walk the primary history of this branch as far back as it goes
pub fn get_all(&self) -> BranchIter {
    let repo = Arc::clone(&self.repo);
    let start = self.head.clone();
    // `SegLimit::None` places no bound on how far back the walk goes
    BranchIter::new(repo, start, SegLimit::None)
}
/// Get a branch segment of a certain length
/// Create a step-limited branch iterator
///
/// This type of iterator is especially useful when combined with
/// `skip()`, to create a paginated view onto commits.
pub fn get(&self, num: usize) -> BranchIter {
BranchIter::new(
Arc::clone(&self.repo),
@ -78,7 +96,17 @@ impl Branch {
)
}
/// Get the commit pointed at by HEAD
/// Create an unbounded branch iterator
///
/// Building the iterator is instantaneous, but actually walking
/// every commit in a repository can be quite computationally
/// intensive, and is almost never what you actually want.
pub fn get_all(&self) -> BranchIter {
    let repo = Arc::clone(&self.repo);
    let head = self.head.clone();
    BranchIter::new(repo, head, SegLimit::None)
}
/// Get the commit that the head of this branch points at
///
/// # Panics
///
/// Panics if `Commit::new` does not produce a commit for the stored
/// head id.
pub fn get_head(&self) -> Commit {
    let head = self.head.clone();
    Commit::new(&self.repo, head).unwrap()
}
@ -89,10 +117,12 @@ impl Branch {
}
}
/// A branch segment iterator
/// A branch slice iterator, created via `Branch` handle
///
/// Each iterator is first-parent, but will notify you about a split
/// parent by setting
/// This iterator yields `BranchCommit` objects, that can either be
/// simple commits, or various types of merge commits with new Branch
/// handles. This means that without explicitly branching, this
/// iterator is first-parent.
pub struct BranchIter {
repo: Arc<Repository>,
curr: Option<HashId>,

@ -26,7 +26,7 @@ impl Commit {
self.id.to_string()
}
/// Get the summary line as a utf-7 string
/// Get the summary line as a utf-8 string
pub fn summary(&self) -> String {
    let commit = self.find();
    // `unwrap` panics when the commit yields no summary value
    let line = commit.summary().unwrap();
    line.into()
}
@ -55,6 +55,10 @@ impl Commit {
.and_then(|c| Self::new(&self.repo, c.id().into()))
}
/// Get the set of parents as a vector
///
/// Use this function if you suspect a commit has more than one
/// parent.
pub fn parents(&self) -> Vec<Commit> {
self.find()
.parents()

@ -1,13 +1,13 @@
use crate::{Branch, BranchIter, Commit, HashId};
use git2::{ObjectType, TreeWalkMode, TreeWalkResult};
use atomptr::AtomPtr;
use git2::{ObjectType, TreeWalkMode, TreeWalkResult};
use std::collections::BTreeMap;
use std::{path::PathBuf, sync::Arc};
/// A tree of files
pub struct FileTree {
repo: Arc<git2::Repository>,
tree: AtomPtr<BTreeMap<String, TreeEntry>>,
tree: AtomPtr<BTreeMap<String, Arc<TreeEntry>>>,
}
impl FileTree {
@ -23,39 +23,78 @@ impl FileTree {
/// Parse a tree from a specific commit
pub(crate) fn parse(self: Arc<Self>, commit: HashId) -> Arc<Self> {
let mut new_tree = BTreeMap::new();
let tree = (&self.repo)
.find_commit(commit.to_oid())
.unwrap()
.tree()
.unwrap();
tree.walk(TreeWalkMode::PreOrder, |what, entry| {
let path_segs: Vec<_> = what.split("/").filter(|s| s != &"").collect();
tree.walk(TreeWalkMode::PreOrder, |p, entry| {
let path_segs: Vec<_> = p.split("/").filter(|s| s != &"").collect();
let path = if path_segs.len() == 0 {
None
} else {
Some(path_segs)
};
println!("{:?} {}", path, entry.name().unwrap());
let te = TreeEntry::generate(path, entry);
new_tree.insert(te.path(), Arc::new(te));
TreeWalkResult::Ok
})
.unwrap();
// Add a special entry for the root of the repo
new_tree.insert(
"".into(),
Arc::new(TreeEntry::Dir(Directory {
id: tree.id().into(),
path: "".into(),
name: "".into(),
})),
);
// This is needed to make borrowchk shut up
drop(tree);
// Atomically swap new tree into place
self.tree.swap(new_tree);
self
}
/// Look up the entry stored under `path` and hand out a new `Arc` handle
///
/// Returns `None` when the current tree has no entry for that path.
fn get_entry(&self, path: &str) -> Option<Arc<TreeEntry>> {
    self.tree.get_ref().get(path).cloned()
}
/// Load the entry stored at `path` in this `FileTree`
///
/// What is returned depends on the kind of entry behind the path:
///
/// * a file yields [`Yield::File`] with the contents of the blob
/// * a directory yields [`Yield::Dir`] with the paths of its direct
///   children
///
/// The empty path `""` addresses the root of the repository.
///
/// Returns `None` if there is no entry for `path`, or if the object
/// behind the entry can't be found in the repository.
pub fn load(&self, path: &str) -> Option<Yield> {
    let entry = self.get_entry(path)?;
    entry.load(&self.repo)
}
}
/// An entry in a file tree
/// Data yielded from loading a part of the file tree
///
/// This type is returned when fetching a path via `FileTree::load()`,
/// and can either be a single file read into memory, or an
/// enumeration of direct children of a directory.
///
/// Its variants can either be a file (leaf), or a subtree with its
/// own path handles and children.
pub enum TreeEntry {
/// To get all children of a subtree, use `Yield::into_tree()` to
/// create a new, recursive `FileTree` to enumerate.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum Yield {
/// The contents of a single file, read into a byte buffer
File(Vec<u8>),
/// The paths of the direct children of a directory
Dir(Vec<String>),
}
enum TreeEntry {
/// A single file
File(File),
/// A sub-tree
@ -63,74 +102,114 @@ pub enum TreeEntry {
}
impl TreeEntry {
/// Create a tree entry from a path and `git2::TreeEntry`
fn generate(root: PathBuf, path_segments: Option<Vec<String>>, entry: git2::TreeEntry) -> Self {
fn generate(path_segments: Option<Vec<&str>>, entry: &git2::TreeEntry) -> Self {
let path = path_segments.map_or("".into(), |p| path_segs_join(p));
let id = entry.id().into();
let name = entry.name().unwrap().into();
match entry.kind() {
Some(ObjectType::Blob) => Self::File(File::new(root, path)),
Some(ObjectType::Tree) => Self::Dir(Directory::new(root, path)),
Some(ObjectType::Blob) => Self::File(File::new(id, path, name)),
Some(ObjectType::Tree) => Self::Dir(Directory::new(id, path, name)),
_ => unimplemented!(),
}
}
/// Load this tree entry from disk, if it is a file
///
/// When calling this function on a directory, nothing will
/// happen, because directories can't be loaded. If you want to
/// get a list of children for a directory, use
/// [`FileTree::enumerate()`]() instead!
pub fn load(&self) -> Option<Vec<u8>> {
if !self.is_file() {
return None;
}
fn load(&self, repo: &Arc<git2::Repository>) -> Option<Yield> {
let id = self.id();
let obj =
match self {
Self::File(ref f) => repo
.find_blob(id.into())
.ok()
.map(|b| Yield::File(b.content().into())),
Self::Dir(ref d) => repo
.find_tree(id.into())
.ok()
.map(|tree| {
let mut children = vec![];
// Iterate the tree, but only as long as there are no
// additional path segments
tree.walk(TreeWalkMode::PreOrder, |p, entry| {
let path_segs: Vec<_> = p.split("/").filter(|s| s != &"").collect();
if path_segs.len() > 0 {
TreeWalkResult::Skip
} else {
// Take the current tree path, and append the
// name of the entry we're currently iterating
// over
let path = PathBuf::new().join(self.path()).join(entry.name().unwrap());
children.push(path.as_path().to_str().unwrap().into());
TreeWalkResult::Ok
}
});
children
})
.map(|c| Yield::Dir(c)),
}
}
/// Check if this tree entry is a file
pub fn is_file(&self) -> bool {
fn is_file(&self) -> bool {
    // Anything that is not the `File` variant counts as "not a file"
    matches!(self, Self::File(_))
}
/// Get a copy of the object id stored for this entry
fn id(&self) -> HashId {
    let id = match self {
        Self::File(file) => &file.id,
        Self::Dir(dir) => &dir.id,
    };
    id.clone()
}
/// Get the repo-internal path of this entry (including its name)
///
/// The returned string is the key under which an entry is stored in
/// a file tree, so that deeply nested items can be looked up
/// directly by their full path.
fn path(&self) -> String {
    let (parent, name) = match self {
        Self::File(file) => (&file.path, &file.name),
        Self::Dir(dir) => (&dir.path, &dir.name),
    };
    let full = PathBuf::new().join(parent).join(name);
    // Both parts are `String`s, so the joined path is valid UTF-8
    full.to_str().unwrap().into()
}
}
/// A file to have ever existed in a git repo
pub struct File {
root: PathBuf,
struct File {
id: HashId,
path: String,
name: String,
}
impl File {
pub(crate) fn new(root: PathBuf, path: String) -> Self {
Self { root, path }
}
/// Get the history of a file from a branch iterator
pub fn get_history(&self, branch: BranchIter) -> Vec<Commit> {
todo!()
fn new(id: HashId, path: String, name: String) -> Self {
Self { id, path, name }
}
}
/// A subdirectory in a file tree
///
/// A directory has a set of children, which can either be Files, or
/// other directories. Many of the functions to retrieve metadata
/// (such as the last commit, count, etc) will be deferred to the
/// children of this directory.
pub struct Directory {
root: PathBuf,
struct Directory {
id: HashId,
path: String,
name: String,
}
impl Directory {
pub(crate) fn new(root: PathBuf, path: String) -> Self {
Self { root, path }
fn new(id: HashId, path: String, name: String) -> Self {
Self { id, path, name }
}
/// Placeholder for listing the children of this directory
///
/// Currently ignores `repo` and always returns an empty list.
fn enumerate(&self, repo: git2::Repository) -> Vec<String> {
    Vec::new()
}
}
////////////////////////////////
/// Take a vector of path segments, and turn it into a valid offset path
///
/// There are tests to make sure this function works properly.
@ -139,7 +218,7 @@ impl Directory {
/// * vec![] -> ""
/// * vec!["foo"] -> "foo"
/// * vec!["foo", "bar", "baz"] -> "foo/bar/baz"
fn path_segs_join(segments: Vec<String>) -> String {
fn path_segs_join(segments: Vec<&str>) -> String {
segments
.into_iter()
.fold(PathBuf::new(), |buf, seg| buf.join(seg))

@ -5,8 +5,10 @@
//! repository, consider using that library instead.
//!
//! supergit aims to make queries into a git repo as typed and easy as
//! possible. Start by creating a [`Repository`](), and enumerating
//! or fetching [`Branch`]()es that you are interested in.
//! possible. Start by creating a
//! [`Repository`](struct.Repository.html), and enumerating or
//! fetching [`Branch`](struct.Branch.html)es that you are interested
//! in.
//!
//! Unlike `libgit2`, this library can resolve reverse dependencies
//! between files, and their commit history. Some of these functions
@ -27,7 +29,7 @@ pub(crate) use repo::HashId;
pub use repo::Repository;
mod files;
pub use files::{File, FileTree};
pub use files::{Yield, FileTree};
use async_std::sync::{Arc, RwLock};
use std::sync::atomic::{AtomicUsize, Ordering};

@ -2,7 +2,7 @@
use crate::{Branch, BranchCommit};
use git2::{self, Oid};
use std::sync::Arc;
use std::{fmt, sync::Arc};
pub type GitResult<T> = Result<T, GitError>;
@ -10,6 +10,12 @@ pub type GitResult<T> = Result<T, GitError>;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct HashId(String);

impl fmt::Display for HashId {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Emit the wrapped id string verbatim
        f.write_str(&self.0)
    }
}
impl HashId {
pub fn to_oid(&self) -> Oid {
self.clone().into()
@ -63,6 +69,7 @@ pub struct Repository {
}
impl Repository {
/// Open a repository read-only at a specific path
pub fn open(path: &str) -> GitResult<Self> {
Ok(Self {
inner: Arc::new(git2::Repository::open(path)?),
@ -71,9 +78,12 @@ impl Repository {
/// Parse branch data from repository
///
/// If you only care about a single branch, you can also use the
/// convenience function `get_branch()`.
///
/// ## Panics
///
/// If there is an error around getting the name, or head commit.
/// This function can panic when branch metadata is missing.
pub fn branches(&self) -> GitResult<Vec<Branch>> {
Ok(self
.inner
@ -88,11 +98,17 @@ impl Repository {
.collect())
}
/// Get the files touched by a commit
pub fn get_files_for(&self, id: HashId) -> GitResult<Vec<()>> {
let c = self.inner.find_commit(id.into())?;
let tree = c.tree()?;
todo!()
/// Get a single branch by name
///
/// This function enumerates all branches, and then selects the
/// desired one. If you want to make repeated queries onto the
/// branch set, it's recommended you call `branches()`, and cache
/// the data yourself.
///
/// Returns `None` if the branches can't be enumerated, or if no
/// branch carries the requested name. Branches without a name are
/// never matched.
pub fn get_branch(&self, name: String) -> Option<Branch> {
    self.branches()
        .ok()?
        .into_iter()
        // Query the name once per branch; a missing name never matches
        .find(|b| b.name().map_or(false, |n| n == name))
}
}

Loading…
Cancel
Save