Merge the concept of scanning and crawling into crawling only, to reduce startup expense. Index: beagled/FileSystemQueryable/CrawlTask.cs =================================================================== RCS file: /cvs/gnome/beagle/beagled/FileSystemQueryable/CrawlTask.cs,v retrieving revision 1.6 diff -u -B -p -r1.6 CrawlTask.cs --- beagled/FileSystemQueryable/CrawlTask.cs 18 Apr 2005 23:50:15 -0000 1.6 +++ beagled/FileSystemQueryable/CrawlTask.cs 20 Jul 2005 15:56:53 -0000 @@ -37,6 +37,11 @@ namespace Beagle.Daemon.FileSystemQuerya FileSystemQueryable queryable; IIndexableGenerator current_generator; + private bool active = true; + public bool Active { + get { return active; } + } + public CrawlTask (FileSystemQueryable queryable) { this.queryable = queryable; @@ -71,33 +76,19 @@ namespace Beagle.Daemon.FileSystemQuerya current_generator = null; FileSystemModel.Directory next_dir = model.GetNextDirectoryToCrawl (); - if (next_dir == null) + if (next_dir == null) { + this.active = false; return; + } - int uncrawled, dirty; - model.GetUncrawledCounts (out uncrawled, out dirty); - this.Description = String.Format ("last={0}, uncrawled={1}, dirty={2})", - next_dir.FullName, uncrawled, dirty); + int uncrawled = model.GetUncrawledCounts (); + this.Description = String.Format ("last={0}, uncrawled={1}", + next_dir.FullName, uncrawled); Logger.Log.Debug ("Crawl Task Scheduling {0} (state={1})", next_dir.FullName, next_dir.State); - // Check the next directory for new subdirectories. If we find any, - // add them to the model. - try { - foreach (DirectoryInfo subdir in DirectoryWalker.GetDirectoryInfos (next_dir.FullName)) { - Logger.Log.Debug ("Looking at {0} in {1}", subdir.Name, next_dir.FullName); - if (! next_dir.HasChildWithName (subdir.Name) - && ! 
model.Ignore (subdir.FullName)) { - Logger.Log.Debug ("Found new subdir {0} under {1}", - subdir.Name, next_dir.FullName); - model.AddChild (next_dir, subdir.Name); - } - } - } catch (DirectoryNotFoundException ex) { - Logger.Log.Debug ("Caught exception, deleting {0} from model", next_dir.FullName); - model.Delete (next_dir); + if (! model.ScanSubdirs (next_dir)) next_dir = null; - } // We want this task to get re-scheduled after it is run. Reschedule = true; @@ -109,6 +100,9 @@ namespace Beagle.Daemon.FileSystemQuerya if (next_dir == null) return; + + // We no longer care about the Open event + model.DropOpenWatch (next_dir); // Set up a task group to mark the time on the directory // after we finish crawling it. Index: beagled/FileSystemQueryable/FileSystemModel.cs =================================================================== RCS file: /cvs/gnome/beagle/beagled/FileSystemQueryable/FileSystemModel.cs,v retrieving revision 1.24 diff -u -B -p -r1.24 FileSystemModel.cs --- beagled/FileSystemQueryable/FileSystemModel.cs 18 Jul 2005 15:50:59 -0000 1.24 +++ beagled/FileSystemQueryable/FileSystemModel.cs 20 Jul 2005 15:56:54 -0000 @@ -121,7 +121,8 @@ namespace Beagle.Daemon.FileSystemQuerya lock (big_lock) { return state == State.Dirty || state == State.Unknown - || state == State.PossiblyClean; + || state == State.PossiblyClean + || state == State.Unscanned; } } } @@ -186,7 +187,7 @@ namespace Beagle.Daemon.FileSystemQuerya } } - protected int CompareTo_Unlocked (object obj) + public int CompareTo_Unlocked (object obj) { Directory other = obj as Directory; if (other == null) @@ -239,8 +240,6 @@ namespace Beagle.Daemon.FileSystemQuerya private class DirectoryPrivate : Directory { - public bool NeedsFinalWatches = true; - public DirectoryPrivate (object big_lock) : base (big_lock) { @@ -353,36 +352,6 @@ namespace Beagle.Daemon.FileSystemQuerya children [new_child.Name] = new_child; } - public Directory SearchForNextToCrawl_Unlocked (Directory candidate) - { - if 
(this.NeedsCrawl && (candidate == null || this.CompareTo_Unlocked (candidate) > 0)) - candidate = this; - if (this.children != null) { - foreach (DirectoryPrivate subdir in this.children.Values) - candidate = subdir.SearchForNextToCrawl_Unlocked (candidate); - } - return candidate; - } - - public void CountUncrawled_Unlocked (ref int uncrawled, ref int dirty) - { - if (NeedsCrawl) { - ++uncrawled; - if (state == State.Dirty) - ++dirty; - } - - if (this.children != null) { - foreach (DirectoryPrivate subdir in this.children.Values) { - int child_uncrawled = 0; - int child_dirty = 0; - subdir.CountUncrawled_Unlocked (ref child_uncrawled, ref child_dirty); - uncrawled += child_uncrawled; - dirty += child_dirty; - } - } - } - public void PutDirectoriesInArray_Unlocked (ArrayList array) { if (NeedsCrawl) @@ -414,12 +383,10 @@ namespace Beagle.Daemon.FileSystemQuerya ArrayList roots = new ArrayList (); Hashtable path_cache = new Hashtable (); Hashtable by_unique_id = new Hashtable (); - Queue to_be_scanned = new Queue (); + ArrayList to_be_crawled = new ArrayList (); FileNameFilter filter; IFileEventBackend event_backend; - int needs_crawl_count = 0; - int block_activity = 0; UniqueIdStore unique_id_store; NameIndex name_index; @@ -521,7 +488,6 @@ namespace Beagle.Daemon.FileSystemQuerya root.SetFromFileAttributes (attr); unique_id_store.AddRoot (attr.UniqueId, path, true); - bool fire_scan_event = false; lock (big_lock) { // FIXME: We also should make sure the path is not a parent or child // of any existing root. @@ -531,23 +497,14 @@ namespace Beagle.Daemon.FileSystemQuerya Logger.Log.Debug ("Adding root {0}", path); roots.Add (root); - to_be_scanned.Enqueue (root); - if (to_be_scanned.Count == 1) - fire_scan_event = true; + to_be_crawled.Add (root); path_cache [root.FullName] = root; } - if (fire_scan_event) { - if (NeedsScanEvent != null) { - NeedsScanEvent (this); - } else { - // If nothing else is listening for this event, - // just do it ourself. 
- ScanAll (); - } - } - + ScanSubdirs (root); + EnsureWatched (root); + if (NeedsCrawlEvent != null) NeedsCrawlEvent (this); return root; } @@ -805,116 +762,54 @@ namespace Beagle.Daemon.FileSystemQuerya } /////////////////////////////////////////////////////////////////////////// - - public delegate void NeedsScanHandler (FileSystemModel source); - public event NeedsScanHandler NeedsScanEvent; - - public bool NeedsScan { - get { lock (big_lock) { return to_be_scanned.Count > 0; } } - } - - private void ScanOne_Unlocked (Directory dir) + + public bool ScanSubdirs (Directory dir) { - DirectoryPrivate priv = (DirectoryPrivate) dir; - - if (dir.State == State.Unscanned && event_backend != null) - priv.SetWatchHandle (event_backend.WatchDirectories (priv.FullName)); + bool ret = true; + lock (big_lock) { + DirectoryPrivate priv = (DirectoryPrivate) dir; - Hashtable known_children = null; - if (dir.State != State.Unscanned) { - known_children = new Hashtable (); - foreach (Directory kid in dir.Children) - known_children [kid.Name] = true; - } + ArrayList known_children = null; + if (dir.State != State.Unscanned) + known_children = new ArrayList (dir.Children); + + System.IO.DirectoryInfo info = new System.IO.DirectoryInfo (priv.FullName); + + // It's the call to GetDirectoryInfos() that may + // trigger the exception caught below. + try { + foreach (System.IO.DirectoryInfo subinfo in DirectoryWalker.GetDirectoryInfos (info)) { + if (known_children != null) + known_children.Remove (priv.GetChildByName (subinfo.Name)); - System.IO.DirectoryInfo info = new System.IO.DirectoryInfo (priv.FullName); + if (Ignore (subinfo.FullName)) + continue; - // It's the call to GetDirectoryInfos() that may - // trigger the exception caught below. - try { - foreach (System.IO.DirectoryInfo subinfo in DirectoryWalker.GetDirectoryInfos (info)) { - if (! 
Ignore (subinfo.FullName)) { Directory child = priv.GetChildByName (subinfo.Name); // We don't know about the child, in which case we need to add it - // (AddChild_Unlocked adds it to to_be_scanned) if (child == null) AddChild_Unlocked (priv, subinfo.Name); // Or, we already know about the child, but we might want to scan it anyway - else if (child.NeedsCrawl) - to_be_scanned.Enqueue (child); + else if (child.NeedsCrawl && !to_be_crawled.Contains (child)) + to_be_crawled.Add (child); } - if (known_children != null) - known_children.Remove (subinfo.Name); + } catch (System.IO.DirectoryNotFoundException e) { + Logger.Log.Warn ("Skipping over {0}: {1}", priv.FullName, e.Message); + Delete (dir); + ret = false; } - } catch (System.IO.DirectoryNotFoundException e) { - Logger.Log.Warn ("Skipping over {0}: {1}", priv.FullName, e.Message); - } - - if (known_children != null) { - foreach (string lost_child_name in known_children.Keys) { - Directory lost_child = priv.GetChildByName (lost_child_name); - Delete (lost_child); - } - } - //if (dir.State == State.Unscanned) - //priv.SetWatchHandle (event_backend.WatchFiles (priv.FullName, priv.WatchHandle)); - - // If the LastWriteTime is more recent than the LastCrawlTime, we - // know that a file was added to or deleted from that directory, - // so we mark it as dirty. - // Otherwise we can't be sure if anything changed in that directory, - // so we mark it as unknown. 
- if (info.LastWriteTime > dir.LastCrawlTime) - priv.SetState (State.Dirty); - else - priv.SetState (State.Unknown); - - ++needs_crawl_count; - } - - public void ScanAll () - { - Stopwatch sw = new Stopwatch (); - sw.Start (); - - ArrayList need_watches = new ArrayList (); - - int count = 0; - bool fire_crawl_event = false; - lock (big_lock) { - int old_needs_crawl_count = needs_crawl_count; - while (to_be_scanned.Count > 0) { - if (Shutdown.ShutdownRequested) { - Logger.Log.Debug ("Bailing out of subdir scan -- shutdown requested"); - return; + if (known_children != null) { + foreach (Directory lost_child in known_children) { + // Whatever is still listed was not seen on disk, so drop it from the model + Delete (lost_child); } - - Directory dir; - dir = to_be_scanned.Dequeue () as Directory; - ScanOne_Unlocked (dir); - need_watches.Add (dir); - ++count; } - - foreach (DirectoryPrivate priv in need_watches) { - if (priv.NeedsFinalWatches) { - if (event_backend != null) - priv.SetWatchHandle (event_backend.WatchFiles (priv.FullName, priv.WatchHandle)); - priv.NeedsFinalWatches = false; - } - } - - if (old_needs_crawl_count == 0 && needs_crawl_count > 0) - fire_crawl_event = true; } - - if (fire_crawl_event && NeedsCrawlEvent != null) - NeedsCrawlEvent (this); - Logger.Log.Debug ("Scanned {0} subdirs in {1}", count, sw); + return ret; } /////////////////////////////////////////////////////////////////////////// @@ -923,33 +818,21 @@ namespace Beagle.Daemon.FileSystemQuerya public event NeedsCrawlHandler NeedsCrawlEvent; - public bool NeedsCrawl { - get { return needs_crawl_count > 0; } - } - - // FIXME: This is inefficient, since we need to walk the entire data structure - // to find the next directory to crawl. 
public Directory GetNextDirectoryToCrawl () { Directory next_to_crawl = null; lock (big_lock) { - if (needs_crawl_count == 0) - return null; - foreach (DirectoryPrivate root in roots) - next_to_crawl = root.SearchForNextToCrawl_Unlocked (next_to_crawl); + foreach (DirectoryPrivate candidate in to_be_crawled) + if (next_to_crawl == null || candidate.CompareTo_Unlocked (next_to_crawl) > 0) + next_to_crawl = candidate; } - + return next_to_crawl; } - public void GetUncrawledCounts (out int uncrawled, out int dirty) + public int GetUncrawledCounts () { - uncrawled = 0; - dirty = 0; - lock (big_lock) { - foreach (DirectoryPrivate root in roots) - root.CountUncrawled_Unlocked (ref uncrawled, ref dirty); - } + return to_be_crawled.Count; } public ICollection GetAllDirectories () @@ -969,26 +852,21 @@ namespace Beagle.Daemon.FileSystemQuerya DirectoryPrivate priv = (DirectoryPrivate) dir; lock (big_lock) { + to_be_crawled.Remove (priv); if (! priv.NeedsCrawl) return; - priv.SetLastCrawlTime (crawl_time); + // FIXME: What if the directory changes between now and the // crawl time... there is a race here. 
- if (priv.IsWatched) { - priv.SetState (State.Clean); - --needs_crawl_count; - } else { - // Re-scan post-crawl - ScanOne_Unlocked (priv); - if (priv.NeedsFinalWatches) { - if (event_backend != null) - priv.SetWatchHandle (event_backend.WatchFiles (priv.FullName, priv.WatchHandle)); - priv.NeedsFinalWatches = false; - } + priv.SetLastCrawlTime (crawl_time); + if (priv.IsWatched) + // If we've crawled while having watches in place, the + // directory is definitely clean + priv.SetState (State.Clean); + else // Unwatched directory can never be clean priv.SetState (State.PossiblyClean); - } FileAttributes attr = backing_store.ReadOrCreate (priv.FullName); attr.LastIndexedTime = priv.LastCrawlTime; @@ -1004,8 +882,7 @@ namespace Beagle.Daemon.FileSystemQuerya public void ReportActivity (Directory dir) { lock (big_lock) { - if (block_activity == 0) - ((DirectoryPrivate) dir).ReportActivity (); + ((DirectoryPrivate) dir).ReportActivity (); } } @@ -1024,13 +901,34 @@ namespace Beagle.Daemon.FileSystemQuerya public void SetAllToUnknown () { lock (big_lock) { - foreach (DirectoryPrivate root in roots) { + foreach (DirectoryPrivate root in roots) root.SetAllToUnknown_Unlocked (); - to_be_scanned.Enqueue (root); - } } - ScanAll (); + if (NeedsCrawlEvent != null) NeedsCrawlEvent (this); + } + + // Create watches if not already present (no-op when there is no event backend) + public void EnsureWatched (Directory dir) + { + lock (big_lock) { + if (dir.IsWatched || event_backend == null) + return; + + DirectoryPrivate priv = (DirectoryPrivate) dir; + priv.SetWatchHandle (event_backend.CreateWatch (priv.FullName)); + } + } + + public void DropOpenWatch (Directory dir) + { + lock (big_lock) { + if (! dir.IsWatched) + return; + // Only an existing watch can be re-subscribed without the Open event + DirectoryPrivate priv = (DirectoryPrivate) dir; + priv.SetWatchHandle (event_backend.DropOpenWatch (priv.FullName, priv.WatchHandle)); + } + } /////////////////////////////////////////////////////////////////////////// @@ -1065,24 +963,18 @@ namespace Beagle.Daemon.FileSystemQuerya FileAttributes attr = backing_store.ReadOrCreate (child.FullName); 
child.SetFromFileAttributes (attr); unique_id_store.Add (child.UniqueId, parent.UniqueId, child.Name, true); - - to_be_scanned.Enqueue (child); - + to_be_crawled.Add (child); + EnsureWatched (child); } public void AddChild (Directory parent, string child_name) { DirectoryPrivate priv = (DirectoryPrivate) parent; - bool fire_scan_event = false; lock (big_lock) { AddChild_Unlocked (priv, child_name); - if (to_be_scanned.Count == 1) - fire_scan_event = true; } - if (fire_scan_event && NeedsScanEvent != null) - NeedsScanEvent (this); } - + public void Delete (Directory dir) { DirectoryPrivate priv = (DirectoryPrivate) dir; @@ -1090,6 +982,7 @@ namespace Beagle.Daemon.FileSystemQuerya lock (big_lock) { by_unique_id.Remove (priv.UniqueId); unique_id_store.Drop (priv.UniqueId); + to_be_crawled.Remove (priv); RecursivelyRemoveFromPathCache_Unlocked (priv); priv.Detatch_Unlocked (); Index: beagled/FileSystemQueryable/FileSystemQueryable.cs =================================================================== RCS file: /cvs/gnome/beagle/beagled/FileSystemQueryable/FileSystemQueryable.cs,v retrieving revision 1.58 diff -u -B -p -r1.58 FileSystemQueryable.cs --- beagled/FileSystemQueryable/FileSystemQueryable.cs 5 Jul 2005 21:12:05 -0000 1.58 +++ beagled/FileSystemQueryable/FileSystemQueryable.cs 20 Jul 2005 15:56:55 -0000 @@ -49,6 +49,7 @@ namespace Beagle.Daemon.FileSystemQuerya private IFileEventBackend event_backend; private FileSystemModel model; + private CrawlTask last_crawl_task; public FileSystemQueryable () : base ("FileSystemIndex", MINOR_VERSION) { @@ -75,7 +76,6 @@ namespace Beagle.Daemon.FileSystemQuerya // The FileSystemModel also implements IFileAttributesStore. 
model = new FileSystemModel (IndexDirectory, index_fingerprint, event_backend); - model.NeedsScanEvent += new FileSystemModel.NeedsScanHandler (OnModelNeedsScan); model.NeedsCrawlEvent += new FileSystemModel.NeedsCrawlHandler (OnModelNeedsCrawl); SetUriRemappers (new LuceneDriver.UriRemapper (model.ToInternalUri), @@ -299,13 +299,12 @@ namespace Beagle.Daemon.FileSystemQuerya // launch a crawling task. private void OnModelNeedsCrawl (FileSystemModel source) { - CrawlTask task = new CrawlTask (this); - ThisScheduler.Add (task, Scheduler.AddType.DeferToExisting); - } + // We only ever want one crawling task + if (last_crawl_task != null && last_crawl_task.Active) + return; - private void OnModelNeedsScan (FileSystemModel source) - { - source.ScanAll (); + last_crawl_task = new CrawlTask (this); + ThisScheduler.Add (last_crawl_task, Scheduler.AddType.DeferToExisting); } public void StartWorker () Index: beagled/FileSystemQueryable/FileSystemWatcherBackend.cs =================================================================== RCS file: /cvs/gnome/beagle/beagled/FileSystemQueryable/FileSystemWatcherBackend.cs,v retrieving revision 1.6 diff -u -B -p -r1.6 FileSystemWatcherBackend.cs --- beagled/FileSystemQueryable/FileSystemWatcherBackend.cs 5 Jul 2005 21:12:05 -0000 1.6 +++ beagled/FileSystemQueryable/FileSystemWatcherBackend.cs 20 Jul 2005 15:56:55 -0000 @@ -48,7 +48,7 @@ namespace Beagle.Daemon.FileSystemQuerya to_be_watched [Path.Combine (PathFinder.HomeDir, "Documents")] = true; } - public object WatchDirectories (string path) + public object CreateWatch (string path) { if (! 
to_be_watched.Contains (path)) return null; @@ -69,8 +69,9 @@ namespace Beagle.Daemon.FileSystemQuerya return fsw; } - public object WatchFiles (string path, object old_handle) + public object DropOpenWatch (string path, object old_handle) { + // Do nothing, FileSystemWatcher doesn't monitor Open events return old_handle; } Index: beagled/FileSystemQueryable/IFileEventBackend.cs =================================================================== RCS file: /cvs/gnome/beagle/beagled/FileSystemQueryable/IFileEventBackend.cs,v retrieving revision 1.2 diff -u -B -p -r1.2 IFileEventBackend.cs --- beagled/FileSystemQueryable/IFileEventBackend.cs 5 Jul 2005 21:12:05 -0000 1.2 +++ beagled/FileSystemQueryable/IFileEventBackend.cs 20 Jul 2005 15:56:55 -0000 @@ -28,8 +28,8 @@ namespace Beagle.Daemon.FileSystemQuerya public interface IFileEventBackend { - object WatchDirectories (string path); - object WatchFiles (string path, object dir_watch_handle); + object CreateWatch (string path); + object DropOpenWatch (string path, object dir_watch_handle); bool ForgetWatch (object watch_handle); void Start (FileSystemQueryable queryable); Index: beagled/FileSystemQueryable/InotifyBackend.cs =================================================================== RCS file: /cvs/gnome/beagle/beagled/FileSystemQueryable/InotifyBackend.cs,v retrieving revision 1.12 diff -u -B -p -r1.12 InotifyBackend.cs --- beagled/FileSystemQueryable/InotifyBackend.cs 6 Jul 2005 22:54:08 -0000 1.12 +++ beagled/FileSystemQueryable/InotifyBackend.cs 20 Jul 2005 15:56:56 -0000 @@ -37,11 +37,17 @@ namespace Beagle.Daemon.FileSystemQuerya FileSystemQueryable queryable; - public object WatchDirectories (string path) + public object CreateWatch (string path) { object watch = null; try { - watch = Inotify.Subscribe (path, OnInotifyEvent, Inotify.EventType.Create); + watch = Inotify.Subscribe (path, OnInotifyEvent, Inotify.EventType.Create + | Inotify.EventType.Open + | Inotify.EventType.Delete + | 
Inotify.EventType.CloseWrite + | Inotify.EventType.MovedFrom + | Inotify.EventType.MovedTo); + } catch (IOException) { // We can race and files can disappear. No big deal. @@ -49,12 +55,16 @@ namespace Beagle.Daemon.FileSystemQuerya return watch; } - public object WatchFiles (string path, object old_handle) + // Modify the watch to not listen for the Open event + // The Open event is only useful pre-crawl for bumping directories + // up the crawling queue. Once we are crawling them, we don't care about + // open events anymore, so we drop that to reduce the chances of an + // inotify queue overflow. + public object DropOpenWatch (string path, object old_handle) { Inotify.Watch watch = (Inotify.Watch) old_handle; try { - watch.ChangeSubscription (Inotify.EventType.Open - | Inotify.EventType.Create + watch.ChangeSubscription (Inotify.EventType.Create | Inotify.EventType.Delete | Inotify.EventType.CloseWrite | Inotify.EventType.MovedFrom