rsrcidx.coffee

Jump To … +

server.coffee src/actionknob.coffee src/autosem.coffee src/bitbucket_kba.coffee src/browserlog.coffee src/datareduction.coffee src/dci.coffee src/dciknob.coffee src/deeseeeye.coffee src/dnd.coffee src/doof.coffee src/formurla-mngr.coffee src/fractalpanel.coffee src/fractalpanel_test.coffee src/front.coffee src/ingestor.coffee src/kbabitbucket.coffee src/knobctrl.coffee src/lib_test.coffee src/nanoclock.coffee src/noodb.coffee src/noodbabstract.coffee src/noodbbrowser.coffee src/noodbbrowser_test.coffee src/noodbsec.coffee src/noorauth.coffee src/noorplugin.coffee src/noorquery.coffee src/noorvm.coffee src/noorwrite.coffee src/quadparser.coffee src/quadparsern3.coffee src/rbac.coffee src/reactor.coffee src/rebase.coffee src/rsrcidx.coffee src/sandboxactions.coffee src/screen_ctx.coffee src/spogi.coffee src/tabular_widget.coffee src/visctrl.coffee src/voicesknob.coffee src/whowhen.coffee src/xsd2native.coffee

rsrcidx.coffee

RdfObject = require('./quadparser').RdfObject
try
  NoiceTrie = require('./noicetrie').NoiceTrie
catch e
  NoiceTrie = require('../lib/noicetrie').NoiceTrie

SOME ENUMERATED TYPES

# Numeric codes for the representation ("TYP") of a term.
# getTyp() returns one of these codes (or the term's own .TYP property).
LITERAL = 1
URI = 2
SAFE_CURIE = 3
CURIE = 4
BLANK = 5
BLANK_AS_CURIE = 6

# Reverse lookup: TYPS[code] is the printable name of the TYP code.
# Index 0 is intentionally left unset because the codes start at 1.
TYPS = []  # give names to each of the TYP numbers
TYPS[LITERAL] = 'LITERAL'
TYPS[URI] = 'URI'
TYPS[SAFE_CURIE] = 'SAFE_CURIE'
TYPS[CURIE] = 'CURIE'
TYPS[BLANK] = 'BLANK'
TYPS[BLANK_AS_CURIE] = 'BLANK_AS_CURIE' # stored as a CURIE but repr() as BLANK

See get_canonical_typ(): the table below defines, for each TYP, the TYP that terms of that kind ought to be stored as.

# Maps a TYP code (or a URI scheme name) to the TYP a term should be stored as.
# RsrcDb.get_canonical_typ() looks keys up here and falls back to URI when
# there is no entry (which is why URI itself has no entry).
TYP_2_CANONICAL_TYP = {}                     # default: URI
TYP_2_CANONICAL_TYP[LITERAL] = LITERAL       # ie 1: LITERAL
TYP_2_CANONICAL_TYP[SAFE_CURIE] = CURIE      # ie 3: CURIE
TYP_2_CANONICAL_TYP[CURIE] = CURIE           # ie 4: CURIE
TYP_2_CANONICAL_TYP[BLANK] = BLANK_AS_CURIE  # ie 5: BLANK_AS_CURIE

TYP_2_CANONICAL_TYP[BLANK_AS_CURIE] = CURIE # ie 6: CURIE

# Entries keyed by URI scheme; get_canonical_typ() consults these when it
# re-invokes itself with the scheme of a URI as its second argument.
TYP_2_CANONICAL_TYP.http = CURIE
TYP_2_CANONICAL_TYP.https = CURIE
TYP_2_CANONICAL_TYP.ftp = CURIE
TYP_2_CANONICAL_TYP.file = URI

# Captures (prefix, local_part) of a bracketed CURIE; used by PrefixDb.fromSafeCURIE().
# NOTE(review): the first group is greedy, so the split happens at the LAST colon.
SAFE_CURIE_REGEX = /\[(.*)\:(.*)\]/
# Captures the part of an http(s) URL between '://' and a following '/'; used by PrefixDb.make_key().
DOMAIN_NAME_REGEX = /^http[s]*:\/\/(.*)\/.*/
# Matches a URI with nothing after the authority except an optional trailing slash;
# RsrcDb.get_canonical_typ() keeps such keys as URI instead of CURIE.
BARE_URI_REGEX = /^.+:\/\/([^\/]*)\/?$/
# Captures the path of a file:/// URI; used by PrefixDb.make_key().
FILE_URI_REGEX = /^file:\/\/\/(.*)$/

blanks_docs_md = """
A 'blank' url should end up with an internal prefix which unites all the
blank urls within that same import operation.  This concept should be
closely examined because ideally the 'same blank url' across two imports
would be recognized as such and have the same internal representation.
Actually NO! They can not be trusted to be the same but an assertion of their
equivalence can be made (and disputed if need be) so they should actually be
different -- in other words each occasion the same file (with different contents)
is imported the blank uris should get a new BLANK_PREFIX.

So is there a relationship between the BLANK_PREFIX and the 'graph url'?
Each time an RDF document is retrieved at some URL, that URL becomes the
fourth 'graph' term of the quads

Proposals:
  Base the prefix for blanks on the prefix for the graph the blank appears in.
  ========

filename: booger.ttl

    _:one a _:whatzit .
  ========
  In this case the prefix for the file would be booger: and so
  booger_:one a booger_:whatzit .

 TODO problems to solve:
    * how to access all the blank urls associated with some importation of a document?
    * how to access all the different imports of a particular graph?
    * how to record the unauthenticated authorship of spogis from remote urls
    * how to discover the blank_prefix for a session
    * hot to deal with graphs which are themselves named with a blank node

TODO evolve these concepts until they are coherently implementable
    * each distinct occasion (ImportEvent) a graph is imported a new unique blank prefix will be used
    * and every triple will really be a quad, if you cannot afford a graph term, one will be provided for you
    * and if an ImportEvent is not the context in which the allegation happened then a per-session-graph-unique prefix will be used
    * AKA each graph alleged about in a session will have a unique blank_prefix generated for it
    * and note that the last-change-date HTTP header will provide the WHEN term

    * RDF files containing triples will use the their source URL as the
      'graph term' of resultant quads
    * Quads will keep their 'graph term'
        * TODO figure out how to protect against spoofing
    * the synthetic prefix for the 'graph term' should become the prefix for the blank
      urls in that file

    * the allegations for a session will use a unique blank_prefix
    * and each file import operation will use a unique blank_prefix

    * create ImportEvent instances for each remote document loading event, with properties
        * who requested
        * HTTP timestamp offered by source server TODO which header?
        * SHA-2 hash of entire file
        * the unique prefix for blank urls
        * the URL imported
        * the request headers?
    * stash ImportEvent instances in a graph with the local parts being the SHA-2 hash FIXME or the unique prefix for blank urls?
    * ImportEvent benefit from the nature of SPOG ids (the 'ID term') being cryptographically derived from the SPOG


So the next question is what to do when the 'graph url' is empty or a blank url.
What the heck does a blank 'context or graph url' even mean? Of course that is
up to the author.  The question is whether they are legal -- Shawn suspects they
are (TODO check!).

"""

# Decide whether a raw term string looks like a literal.
#
# A term is considered a literal iff its first character is one of
# `"`, `+`, `-` or a decimal digit.
#
# @param str [string|String] the raw term text
# @return [boolean] true iff str[0] is in '"+-0123456789'
# @throws [Error] when str is neither a primitive string nor a String object
isLiteral = (str) ->
  # Do not sanitize str because it should be a string; if it is not a string -- explode
  if typeof str isnt 'string'
    # the soak operators make null/undefined fail with the explicit message
    # below rather than with an unrelated TypeError
    if str?.constructor?.name isnt 'String'
      throw new Error("isLiteral(str) expects str to be a string, but it was #{typeof str }")
  # FIXME is a single-quote a legal way to start a string in .rdf, .ttl, .nq etc
  retval = str.match(/^[\-\"\+\d]/) and true or false # In other words, return true iff str[0] in '"+-0123456789'
  # leftover diagnostic for terms mentioning YAGO3 which are NOT classified as literals
  if /YAGO3/.exec(str) and not retval
    console.log('YAGO3 MATCH!!!! should return true but will return', retval, "first char is: <#{str[0]}>")
  return retval

References for URI schemes (AKA protocols): https://themify.me/docs/extending-allowed-url-protocols contains a good list of URI schemes; https://developer.apple.com/library/content/featuredarticles/iPhoneURLScheme_Reference/SMSLinks/SMSLinks.html documents `sms:`; https://en.wikipedia.org/wiki/XMPP documents `xmpp:` (aka Jabber).

# Case-insensitive match for terms which start with a well known URI scheme.
# getTyp() classifies such terms as URI rather than CURIE.
STANDARD_PROTOCOLS_REGEX = new RegExp('^(http|https|ftp|ftps|file|mailto|geo|news|isbn|irc|gopher|tel|fax|xmpp|sms):', 'i')

# Matches "prefix:local" where prefix is one or more word characters.
# Written as a regex literal: inside a quoted string the '\w' escape collapses
# to a plain 'w', which made the old pattern match only prefixes made of 'w's.
CURIE_REGEX = /^\w+:.+/

# Wrap a CURIE in square brackets, eg 'dc:title' becomes '[dc:title]'.
curie_to_safe_curie = (curie) ->
  '[' + curie + ']'

# Strip the first and last character of a SAFE_CURIE, eg '[dc:title]' becomes 'dc:title'.
# Only the length is checked (it must exceed 2); the brackets themselves are not verified.
# @throws [Error] when safe_curie has two characters or fewer
safe_curie_to_curie = (safe_curie) ->
  unless safe_curie.length > 2
    throw new Error("safe_curie_to_curie() expects a safe_curie, unlike '#{safe_curie}'")
  safe_curie.slice(1, -1)

# Classify a term, returning one of the TYP codes.
# A truthy .TYP property on the term wins outright; otherwise the checks
# below are tried in order and the first one which holds decides the TYP.
# Anything which matches none of them is taken to be a CURIE.
getTyp = (str) ->
  return str.TYP if str.TYP
  switch
    when str.startsWith('"_:') or str.startsWith('_:') then BLANK
    when isLiteral(str) then LITERAL
    when str.startsWith('[') and str.endsWith(']') then SAFE_CURIE
    when str.match(STANDARD_PROTOCOLS_REGEX) then URI
    else CURIE

# Cursor over the terms of a query-for-listeners (q4l).
# The q4l is expected to expose its terms as the array-like `query`.
class QueryForListenersInProgress
  constructor: (@q4l) ->
    @next_term_idx = 0
  # Return the next unvisited term of @q4l.query and advance the cursor.
  # Once every term has been handed out, return undefined and leave the
  # cursor parked at the end.
  # (Previously the fetched term was discarded and the index was returned.)
  getNextTerm: ->
    if @next_term_idx >= @q4l.query.length
      return undefined
    retval = @q4l.query[@next_term_idx]
    @next_term_idx++
    return retval

# PrefixDb keeps the prefix <-> expansion mapping (in a NoiceTrie) and converts
# terms between their URI, CURIE and SAFE_CURIE representations.
class PrefixDb
  constructor: () ->
    @prefixes = new NoiceTrie()

  # Register prefix `k` for the expansion `v`.
  # A missing prefix is replaced by a synthesized one.  When @prefixes.add()
  # throws and `k` is currently bound to a different expansion, `v` is
  # registered under a freshly synthesized, non-colliding prefix instead.
  # NOTE(review): every exception from add() is treated as a collision -- confirm
  # that NoiceTrie.add() cannot fail for other reasons.
  add_prefix: (k, v) ->
    try
      if not k # eg a TTL line like """@prefix : <http://www.w3.org/ns/prov#> ."""
        k = @synthesize_prefix(v)
      @prefixes.add(k, v)
    catch e
      # TODO handle the situation where a prefix has already been defined as something else
      old_v = @prefixes.getValue(k)
      if old_v isnt v
        # console.log(@prefixes.tree())
        # throw new Error("prefix #{k}: <#{v}> has already been defined as <#{old_v}> node_count:#{@prefixes.node_count}")
        new_k = @synthesize_non_colliding_prefix(k, v, old_v)
        @add_prefix(new_k, v)
      # throw e

  # Log every known prefix with its expansion, preceded by a header line.
  dump: (msg) ->
    console.log(msg? and "dump(\"#{msg}\")" or "dump()")
    # console.log @prefixes.tree()
    for k, v of @prefixes.k2leaf
      console.log "  #{k}: #{v.getValue()}"

  # Return the next synthetic prefix: 'SYN' followed by a per-instance counter.
  # Any arguments passed by callers are ignored.  The counter is initialised
  # to 1 and incremented before use, so the first prefix issued is 'SYN2'.
  synthesize_prefix: () ->
    if not @synth_count
      @synth_count = 1
    @synth_count++
    return 'SYN'+@synth_count

  # create a new prefix based on prefix `k` which does not collide with the already existing one
  synthesize_non_colliding_prefix: (k, v, existing_v) ->
    return k + @synthesize_prefix()

  # Callback handed to the trie for inventing a key (prefix) for `to_abbrev`.
  # If abbrev failed to find a key, make_key is called; retval is passed to
  # make_key so custom implementations can override it.
  # Returns retval when it is truthy, a synthesized prefix for http(s) and
  # file:/// URIs, and the string "BADKEY" for anything else.
  make_key: (to_abbrev, retval, noice_trie) ->
    if not @prefixes?
      throw new Error("make_key() called unbound")
    if retval
      return retval
    candidate_match = to_abbrev.match(DOMAIN_NAME_REGEX)
    if candidate_match
      prfx = @synthesize_prefix(candidate_match)
      while noice_trie.has_k(prfx)
        prfx = @synthesize_prefix(candidate_match)
      # TODO improve algorithm for synthesizing prefixes, eg derive a readable
      # prefix from the domain name (lowercased, vowels, dots, 'www' and the
      # TLD removed) and fall back to a synthetic one only on collision.
      # An unreachable draft of that idea used to follow this return.
      return prfx
    if to_abbrev.match(FILE_URI_REGEX)
      return @synthesize_prefix(to_abbrev)
    return "BADKEY"

  # Return make_key wrapped so that `this` stays bound when the trie calls it.
  make_cb_for_make_key: ->
    return (a,b,c) =>
      @make_key(a,b,c)

  # Convert a URI, CURIE or SAFE_CURIE to SAFE_CURIE form, eg '[dc:title]'.
  toSafeCURIE: (uri_or_curie, make_key) ->
    # THIS is where a Trie is tempting because we want to search
    typ = getTyp(uri_or_curie)
    if typ is SAFE_CURIE
      return uri_or_curie
    if typ is CURIE
      return curie_to_safe_curie(uri_or_curie)
    # URI then!
    return curie_to_safe_curie(@prefixes.abbrev(uri_or_curie, make_key or @make_cb_for_make_key()))

  # Convert a URI, CURIE or SAFE_CURIE to CURIE form, eg 'dc:title'.
  # `context` is accepted for signature parity with callers but is not used
  # here; passing a function in that position is a programming error.
  # @throws [Error] when make_key is passed in the context position
  toCURIE: (uri_or_curie, context, make_key) ->
    if typeof(context) is 'function'
      # this used to log and process.exit(); raising keeps the host process alive
      throw new Error("toCURIE() is being called with the make_key arg in the wrong position")
    typ = getTyp(uri_or_curie)
    if typ is SAFE_CURIE
      return safe_curie_to_curie(uri_or_curie)
    if typ is CURIE
      return uri_or_curie
    # URI then!
    return @prefixes.abbrev(uri_or_curie, make_key or @make_cb_for_make_key())

  # Expand a SAFE_CURIE to the full URI.
  # The '_' prefix (blank nodes) expands to '_:' exactly as in fromCURIE().
  # @throws [Error] when safe_curie is malformed or its prefix is unknown
  fromSafeCURIE: (safe_curie) -> # eg '[dc:title]'
    parts = safe_curie.match(SAFE_CURIE_REGEX)
    if not parts?
      throw new Error("fromSafeCURIE() expects a safe_curie, unlike '#{safe_curie}'")
    prefix = parts[1]
    local_part = parts[2]
    expansion = @prefixes.getValue(prefix)
    if not expansion?
      if prefix is '_'
        expansion = '_:'
      else
        throw new Error("no prefix found for #{safe_curie}")
    if not local_part?
      throw new Error("uh no local_part found in #{safe_curie}")
    return  expansion + local_part

  # Expand a CURIE to the full URI.
  # The CURIE is split at its FIRST colon so local parts which themselves
  # contain colons survive intact.
  # @throws [Error] when the prefix is unknown or there is no colon at all
  fromCURIE: (curie) -> # eg 'dc:title'
    colon_at = curie.indexOf(':')
    if colon_at < 0
      prefix = curie
      local_part = undefined
    else
      prefix = curie.slice(0, colon_at)
      local_part = curie.slice(colon_at + 1)
    expansion = @prefixes.getValue(prefix)
    if not expansion?
      # @dump("fromCURIE() failing to find '#{prefix}'")
      if prefix is '_' # TODO add TYP called BLANK for when prefix is underscore
        expansion = '_:'
      else
        throw new Error("no prefix found for #{curie} in @prefixes with #{@prefixes.node_count} nodes")
    if not local_part?
      throw new Error("uh no local_part found in #{curie}")
    return  expansion + local_part

  # NOTE(review): appears to be a disabled legacy method (XXX name prefix); it
  # treats @prefixes as a plain object rather than going through the trie API.
  XXXaddPrefix: (prefix, expansion) ->
    if @prefixes[prefix]?
      throw new Error("prefix #{prefix} already exists")
    @prefixes[prefix] = expansion

  # Rewrite a blank term (eg '_:b1') by replacing its first character with the
  # blank prefix of `context`, creating that prefix when the context has none.
  # @throws [Error] when the resulting prefix would be '_'
  canonicalize_BLANK: (blank_uri, context) ->
    prefix = context.blank_prefix
    if not prefix
      prefix = @getOrCreate_prefix_for_blanks_in_ctx(context)
    if global.TRACE
      console.log "canonicalize_BLANK('#{blank_uri}') prefix='#{prefix}', context=", context
    if prefix is '_'
      throw new Error('BLANKS should never be stored with _ as their prefix')
    return prefix + blank_uri.substr(1) # replace the leading _ with the prefix

  # Purpose:
  #   Provide a way to have prefixes which define a separate namespace for the
  #   BLANK nodes of each RDF document (and each time it was retrieved).
  # Examples:
  #   ('http://gov.ca/stats.ttl', '2017-07-02T20:58:34') ==> 'SYN123.1'
  #     this would hold if this was the first retrieval of the ctx_uri
  #     AND SYN123 was the prefix for ctx_uri
  #   ('http://gov.ca/stats.ttl', '2017-07-02T21:15:34') ==> 'SYN123.2'
  #     this would hold if this was the second retrieval of the ctx_uri, etc
  # NOTE(review): the suffix is currently always '.1'; counting retrievals is
  # not implemented here.
  #
  # Prepend a special something to uri to make a uri which signifies
  # prx4bl4uri, ie 'prefix for BLANKs for the URI'.
  #   OGHAM FEATHER MARK (U+169B, Ps): ᚛
  #   OGHAM REVERSED FEATHER MARK (U+169C, Pe): ᚜
  # Reserve the possibility of putting a timestamp between the feather marks
  # eg '᚜2017-07-02T20:22:00᚛' signifying that the file was retrieved at that time.
  #
  # Side effect: the chosen prefix is stored on context.blank_prefix.
  getOrCreate_prefix_for_blanks_in_ctx: (context) ->
    special_something = "᚜#{context.timestamp or ''}᚛"
    uri_for_blanks = context.uri + special_something
    ctx_uri_key = @getOrCreate_prefix_for(context.uri)
    prefix_for_blanks = ctx_uri_key + '.1'
    @add_prefix(prefix_for_blanks, uri_for_blanks)
    context.blank_prefix = prefix_for_blanks
    return prefix_for_blanks

  # Return the key the trie has (or makes) for `uri`.
  # @throws [Error] when uri is null or undefined
  getOrCreate_prefix_for: (uri) ->
    uri? or throw new Error('uri must be defined')
    return @prefixes.find_or_make_key_for(uri, @make_cb_for_make_key())

# RsrcDb is the registry of Rsrc instances, keyed by the canonical form of
# each term (see get_canonical_typ for what 'canonical' means).
class RsrcDb
  constructor: (@prefixdb) ->
    @mbrs = {}

  # The canonical_typ for a key is the representation_typ it should be stored
  # and compared in.  The return value is one of the TYPs themselves: URI,
  # SAFE_CURIE, CURIE, LITERAL instead of being the key IN that representation.
  #
  # The canonical_typ for URI representation in RsrcDb is CURIE, but some URI
  # schemes (the non-DNS ones particularly) do not benefit from the compression
  # provided by conversion to CURIE so their canonical_typ is URI
  # (eg geo:145.33,-23.44 OR isbn:91233123333).  In other words, for such
  # resources the URI itself is used as the key, not the CURIE.
  # CURIE is used for typical urls because it is most compact and there is no
  # need for the 'safety' of SAFE_CURIEs.
  #
  # The case of BLANK is interesting.  BLANKs (eg '_:boo') are represented by
  # CURIEs where the CURIE has a structured synthetic prefix which establishes
  # a unique namespace for the contents of each different retrieval of the
  # same document.
  #
  # NOTE: An unsolved problem is how literals should be stored vs how TYPED
  # literals (eg "1964-07-24^^xsd:date") should be handled vs how typed
  # literals with custom types (eg "3/5^^http://example.org/rationalNumber")
  # (how Jena does this: https://jena.apache.org/documentation/notes/typed-literals.html)
  #
  # The second argument is either a TYP code or, on the recursive call below,
  # the scheme of a URI key; when omitted it is computed with getTyp(key).
  get_canonical_typ: (key, current_typ_or_uri_scheme) ->
    # console.log("get_canonical_typ(#{JSON.stringify(key)}, #{current_typ_or_uri_scheme})")
    if not current_typ_or_uri_scheme?
      current_typ_or_uri_scheme = getTyp(key) # get it if missing
    canonical_typ = TYP_2_CANONICAL_TYP[current_typ_or_uri_scheme]
    # a bare 'scheme://host/' gains nothing from abbreviation, so keep it as a URI
    if canonical_typ is CURIE and key.match(BARE_URI_REGEX)
      canonical_typ = URI
    if not canonical_typ?
      if current_typ_or_uri_scheme is URI
        scheme = key.split(':')[0]
        if scheme
          # get the canonical_typ for uri schemes with particular ones
          canonical_typ = @get_canonical_typ(key, scheme)
      canonical_typ ?= URI
    return canonical_typ or URI

  # Return [canonical_key, canonical_typ] for `key`.
  # `context` is needed for BLANK keys (it supplies or receives the blank
  # prefix) and is passed along to toCURIE().
  # @throws [Error] when no conversion from the current TYP to the canonical
  #   TYP is known (this used to surface as an undefined return value)
  get_canonical_form_typ_pair: (key, context) ->
    current_typ = getTyp(key)
    canonical_typ = @get_canonical_typ(key, current_typ)
    key = key.valueOf()
    if current_typ is BLANK
      return [@prefixdb.canonicalize_BLANK(key, context), canonical_typ]
    if current_typ is canonical_typ
      return [key, canonical_typ]
    if canonical_typ is CURIE
      return [@prefixdb.toCURIE(key, context), CURIE]
    if canonical_typ is URI
      if current_typ is CURIE
        return [@prefixdb.fromCURIE(key), URI]
      if current_typ is SAFE_CURIE
        return [@prefixdb.fromSafeCURIE(key), URI]
    if canonical_typ is SAFE_CURIE
      return [@prefixdb.toSafeCURIE(key), SAFE_CURIE]
    throw new Error("get_canonical_form_typ_pair() cannot convert '#{key}' from #{TYPS[current_typ]} to #{TYPS[canonical_typ]}")

  # Legacy helper used by XXgetOrCreate(): report whether `key` is a uri and
  # give its CURIE form.  LITERAL keys are returned untouched.
  check_isUri_and_key: (key) ->
    typ = getTyp(key)
    if typ isnt LITERAL
      key = @prefixdb.toCURIE(key)
      # NOTE(review): the second toCURIE() below looks redundant for a key which
      # is already a CURIE; kept because its effect on other inputs is not
      # visible from here.
      return {isUri: true, key: @prefixdb.toCURIE(key), typ: CURIE}
    return {isUri: false, key: key, typ: LITERAL}

  # Look up the Rsrc stored for `key`; undefined when there is none.
  # When `typ` is not supplied the key is canonicalized first.
  get: (key, typ, context) ->
    if not typ?
      [key, typ] = @get_canonical_form_typ_pair(key, context)
    if global.TRACE
      console.log "get('#{key}', #{typ})"
      for k,v of @mbrs
        console.log "  ", k
    return @mbrs[key]

  # Return the Rsrc for `key`, creating and registering it when missing.
  # The `isUri` argument is ignored: it is recomputed from the canonical TYP.
  # @throws [Error] when a BLANK or BLANK_AS_CURIE key still starts with '_'
  getOrCreate: (key, isUri, context) ->
    [canonical_key, canonical_typ] = @get_canonical_form_typ_pair(key, context)
    if canonical_typ > CURIE # ie BLANK or BLANK_AS_CURIE
      if canonical_key.startsWith('_')
        throw new Error(TYPS[canonical_typ]+" should not start with _")
    isUri = not (canonical_typ is LITERAL)
    rsrc = @mbrs[canonical_key]
    if not rsrc?
      rsrc = new Rsrc(canonical_key, isUri, canonical_typ, @)
      @mbrs[canonical_key] = rsrc
    return rsrc

  # NOTE(review): appears to be the disabled predecessor of getOrCreate()
  # (XX name prefix); it relies on check_isUri_and_key() instead of
  # get_canonical_form_typ_pair().
  XXgetOrCreate: (key, isUri) ->
    if not isUri? # "is isUri specified?" NOT "is it true?"
      {isUri, key, typ} = @check_isUri_and_key(key)
    rsrc = @mbrs[key]
    if not rsrc?
      rsrc = new Rsrc(key, isUri, typ, @)
      @mbrs[key] = rsrc
    return rsrc

  # Record `obj` as an occurrence of the resource `key`, creating the resource
  # when needed.  `context` is optional and only required for BLANK keys.
  set: (key, obj, context) ->
    @getOrCreate(key, undefined, context).addObj(obj)

  # Every occurrence recorded for `key`, or [] when the key is unknown.
  getAll: (key, typ, context) ->
    rsrc = @get(key, typ, context)
    if rsrc?
      return rsrc.all()
    return []

  # The first occurrence recorded for `key`, or undefined when the key is unknown.
  # `context` is optional and only required for BLANK keys.
  getOnly: (key, typ, context) ->
    rsrc = @get(key, typ, context)
    if rsrc?
      return rsrc.first()
    return undefined

  # Whether a resource is registered for `key`.
  # `context` is optional and only required for BLANK keys.
  # @throws [Error] when key is null or undefined
  exists: (key, context) ->
    if not key?
      throw new Error "exists() requires that key have a value"
    return @get(key, undefined, context)?

  # Return a printable listing of every key with its value.
  dump: ->
    buffer = '-'.repeat(72) + "\n"
    for k,v of @mbrs
      buffer += "  #{k} = #{v.getValue()}\n"
    return buffer

# Each URI has one Rsrc instance built for it.
# (RsrcDb also builds them for literals, in which case @isUri is false.)
#
# @_key  the term in its canonical form
# @isUri false for literals
# @typ   the TYP code @_key is expressed in
# @db    the owning RsrcDb (its prefixdb is used for conversions)
class Rsrc
  constructor: (@_key, @isUri, @typ, @db) ->
    @occs = [] # occurrences
    @queriesForListeners = [] # this resource is the first one mentioned in these queries

  # Record an occurrence; when processQueries is truthy, offer it to every
  # registered query-for-listeners.
  addObj: (spogi, processQueries) ->
    @occs.push(spogi)
    if processQueries
      if @queriesForListeners.length
        @queriesForListeners.forEach (q4l) ->
          q4l.sendIfSatisfies(spogi)

  addQueryForListeners: (q4l) ->
    @queriesForListeners.push(q4l)
    # console.log "Rsrc.addQueryForListeners(#{@key()}) @queriesForListeners.len: #{@queriesForListeners.length} q4l:", q4l

  last: () ->
    @occs[@occs.length - 1]
  first: () ->
    @occs[0]
  all: () ->
    return @occs
  key: () ->
    @_key
  raw: () ->
    "#{@_key}"

  # Native value of the key as computed by RdfObject; cached after first use.
  getNativeValue: () ->
    if not @_ntval?
      obj = new RdfObject(@_key)
      @_ntval = obj.getNativeValue()
    @_ntval

  # The .value of the key as parsed by RdfObject; cached after first use.
  getValue: () ->
    if not @_val?
      obj = new RdfObject(@_key)
      @_val = obj.value
    @_val

  # Strict comparison of the key with `val`; mismatches are logged.
  eql: (val) ->
    retval = @_key is val
    if not retval
      console.log(@_key, '<>', val)
    return retval

  # Angle-bracketed form for uris and blanks, literal_value() otherwise.
  repr: ->
    # TODO if this Rsrc is really a value (because its the @obj) then return it properly formatted
    if @typ is BLANK_AS_CURIE
      return "<_:#{@local_part()}>"
    if @isUri
      return "<#{@uri()}>"
    return @literal_value()

  # The key as text; keys containing a newline are wrapped in triple quotes.
  literal_value: ->
    val = @_key
    if val.match(/\n/)
      return '"""' + val + '"""'
    return "#{val}"

  asTTL: ->
    if @isUri
      return @curie()
    return @literal_value()

  # Everything after the FIRST colon of the key (so local parts which contain
  # colons are returned whole); undefined when the key has no colon.
  local_part: ->
    colon_at = @_key.indexOf(':')
    if colon_at < 0
      return undefined
    @_key.slice(colon_at + 1)

  prefix_part: ->
    @_key.split(':')[0]

  safe_curie: ->
    if @typ is SAFE_CURIE
      return @_key
    # double quotes are required: CoffeeScript does not interpolate in '...'
    return "[#{@curie()}]"

  # The full URI for this resource.
  # @throws [Error] when this resource is not a uri
  uri: ->
    if not @isUri
      throw new Error("#{@_key} has typ: #{@typ} and is hence not a uri")
    if @typ is SAFE_CURIE
      return @db.prefixdb.fromSafeCURIE(@_key)
    if @typ is CURIE
      return @db.prefixdb.fromCURIE(@_key)
    return @_key # by elimination, @_key is already typ URI

  # The CURIE form of this resource.
  # @throws [Error] when this resource is not a uri or has an unexpected typ
  curie: () ->
    if not @isUri
      throw new Error("#{@_key} has typ: #{@typ} and is hence not a uri")
    if @typ in [CURIE, BLANK_AS_CURIE]
      return @_key
    if @typ is URI
      return @db.prefixdb.toCURIE(@_key)
    if @typ is SAFE_CURIE
      # is SAFE_CURIE by elimination
      return safe_curie_to_curie(@_key)
    throw new Error("'#{@_key}' is of mysterious typ: #{@typ}")

# Public API of this module, attached to `exports` when it exists (CommonJS)
# and to `this` otherwise.
Object.assign((exports ? this), {
  isLiteral, PrefixDb, RsrcDb, Rsrc, getTyp,
  LITERAL, URI, SAFE_CURIE, CURIE, BLANK,
  TYPS, STANDARD_PROTOCOLS_REGEX
})