RdfObject = require('./quadparser').RdfObject
try
NoiceTrie = require('./noicetrie').NoiceTrie
catch e
NoiceTrie = require('../lib/noicetrie').NoiceTrie
RdfObject = require('./quadparser').RdfObject
try
NoiceTrie = require('./noicetrie').NoiceTrie
catch e
NoiceTrie = require('../lib/noicetrie').NoiceTrie
# SOME ENUMERATED TYPES
# Numeric tags for the textual forms a term can take.
LITERAL = 1
URI = 2
SAFE_CURIE = 3
CURIE = 4
BLANK = 5
BLANK_AS_CURIE = 6

TYPS = [] # give names to each of the TYP numbers
TYPS[LITERAL] = 'LITERAL'
TYPS[URI] = 'URI'
TYPS[SAFE_CURIE] = 'SAFE_CURIE'
TYPS[CURIE] = 'CURIE'
TYPS[BLANK] = 'BLANK'
TYPS[BLANK_AS_CURIE] = 'BLANK_AS_CURIE' # stored as a CURIE but repr() as BLANK

# see get_canonical_typ()
# define the TYP that each different TYP ought to be stored as
TYP_2_CANONICAL_TYP = {} # default: URI
TYP_2_CANONICAL_TYP[LITERAL] = LITERAL # ie 1: LITERAL
TYP_2_CANONICAL_TYP[SAFE_CURIE] = CURIE # ie 3: CURIE
TYP_2_CANONICAL_TYP[CURIE] = CURIE # ie 4: CURIE
TYP_2_CANONICAL_TYP[BLANK] = BLANK_AS_CURIE # ie 5: BLANK_AS_CURIE
TYP_2_CANONICAL_TYP[BLANK_AS_CURIE] = CURIE # ie 6: CURIE
# The same table is also keyed by URI scheme name.
TYP_2_CANONICAL_TYP.http = CURIE
TYP_2_CANONICAL_TYP.https = CURIE
TYP_2_CANONICAL_TYP.ftp = CURIE
TYP_2_CANONICAL_TYP.file = URI

# Captures (prefix, local_part) of a bracketed CURIE such as '[dc:title]'.
# The prefix group is lazy so that the split happens at the FIRST colon:
# '[ex:a:b]' yields prefix 'ex' and local part 'a:b'.
SAFE_CURIE_REGEX = /\[(.*?)\:(.*)\]/
# Matches http/https urls containing a '/' after the '//'; group 1 is
# everything between the '//' and the last '/'.
DOMAIN_NAME_REGEX = /^http[s]*:\/\/(.*)\/.*/
# Matches '<scheme>://<authority>' with nothing after an optional trailing slash.
BARE_URI_REGEX = /^.+:\/\/([^\/]*)\/?$/
# Matches 'file:///...' uris; group 1 is the part after the third slash.
FILE_URI_REGEX = /^file:\/\/\/(.*)$/
blanks_docs_md = """
A 'blank' url should end up with an internal prefix which unites all the
blank urls within that same import operation. This concept should be
closely examined because ideally the 'same blank url' across two imports
would be recognized as such and have the same internal representation.
Actually NO! They can not be trusted to be the same but an assertion of their
equivalence can be made (and disputed if need be) so they should actually be
different -- in other words each occasion the same file (with different contents)
is imported the blank uris should get a new BLANK_PREFIX.
So is there a relationship between the BLANK_PREFIX and the 'graph url'?
Each time an RDF document is retrieved at some URL, that URL becomes the
fourth 'graph' term of the quads
Proposals:
Base the prefix for blanks on the prefix for the graph the blank appears in.
========
filename: booger.ttl
_:one a _:whatzit .
========
In this case the prefix for the file would be booger: and so
booger_:one a booger_:whatzit .
TODO problems to solve:
* how to access all the blank urls associated with some importation of a document?
* how to access all the different imports of a particular graph?
* how to record the unauthenticated authorship of spogis from remote urls
* how to discover the blank_prefix for a session
* how to deal with graphs which are themselves named with a blank node
TODO evolve these concepts until they are coherently implementable
* each distinct occasion (ImportEvent) a graph is imported a new unique blank prefix will be used
* and every triple will really be a quad, if you cannot afford a graph term, one will be provided for you
* and if an ImportEvent is not the context in which the allegation happened then a per-session-graph-unique prefix will be used
* AKA each graph alleged about in a session will have a unique blank_prefix generated for it
* and note that the last-change-date HTTP header will provide the WHEN term
* RDF files containing triples will use their source URL as the
'graph term' of resultant quads
* Quads will keep their 'graph term'
* TODO figure out how to protect against spoofing
* the synthetic prefix for the 'graph term' should become the prefix for the blank
urls in that file
* the allegations for a session will use a unique blank_prefix
* and each file import operation will use a unique blank_prefix
* create ImportEvent instances for each remote document loading event, with properties
* who requested
* HTTP timestamp offered by source server TODO which header?
* SHA-2 hash of entire file
* the unique prefix for blank urls
* the URL imported
* the request headers?
* stash ImportEvent instances in a graph with the local parts being the SHA-2 hash FIXME or the unique prefix for blank urls?
* ImportEvent benefit from the nature of SPOG ids (the 'ID term') being cryptographically derived from the SPOG
So the next question is what to do when the 'graph url' is empty or a blank url.
What the heck does a blank 'context or graph url' even mean? Of course that is
up to the author. The question is whether they are legal -- Shawn suspects they
are (TODO check!).
"""
isLiteral = (str) ->
  # Report whether str looks like a literal, judged only by its first
  # character: true iff str[0] is one of '"+-0123456789'.
  #
  # str     -- a string primitive (or a String object)
  # returns -- true or false
  # throws  -- Error when str is not a string; this includes null and
  #            undefined, which used to fail with an unrelated TypeError
  #
  # Do not sanitize str because it should be a string, if it is not a string -- explode
  is_stringish = typeof str is 'string' or str?.constructor?.name is 'String'
  if not is_stringish
    throw new Error("isLiteral(str) expects str to be a string, but it was #{typeof str }")
  # FIXME is a single-quote a legal way to start a string in .rdf, .ttl, .nq etc
  retval = /^[\-\"\+\d]/.test(str)
  # Diagnostic kept from the original: flags YAGO3 terms not seen as literals.
  if /YAGO3/.exec(str) and not retval
    console.log('YAGO3 MATCH!!!! should return true but will return', retval, "first char is: <#{str[0]}>")
  return retval
# https://themify.me/docs/extending-allowed-url-protocols
#   Contains a good list of uri schemes AKA protocols
# https://developer.apple.com/library/content/featuredarticles/iPhoneURLScheme_Reference/SMSLinks/SMSLinks.html
#   sms:
# https://en.wikipedia.org/wiki/XMPP
#   xmpp: aka Jabber
# Case-insensitive test for a term starting with one of the listed schemes plus ':'.
STANDARD_PROTOCOLS_REGEX = new RegExp('^(http|https|ftp|ftps|file|mailto|geo|news|isbn|irc|gopher|tel|fax|xmpp|sms):', 'i')
# One or more word characters, a colon, then at least one more character.
# Written as a regex literal: inside a quoted string '\w' collapses to a plain
# 'w', which made the earlier pattern match only terms starting with 'w'.
CURIE_REGEX = /^\w+:.+/
curie_to_safe_curie = (curie) ->
  # Wrap a CURIE in square brackets, eg 'dc:title' becomes '[dc:title]'.
  '[' + curie + ']'
safe_curie_to_curie = (safe_curie) ->
  # Drop the first and last character of safe_curie, eg '[dc:title]'
  # becomes 'dc:title'.  Only the length is checked, not the brackets.
  # throws -- Error unless safe_curie is longer than two characters
  unless safe_curie.length > 2
    throw new Error("safe_curie_to_curie() expects a safe_curie, unlike '#{safe_curie}'")
  safe_curie.slice(1, -1)
getTyp = (str) ->
  # Classify a term as one of the TYP constants.  The checks run in this
  # order and the first hit wins:
  #   a truthy str.TYP           -> that value, returned as is
  #   starts with '"_:' or '_:'  -> BLANK
  #   isLiteral(str)             -> LITERAL
  #   wrapped in [ and ]         -> SAFE_CURIE
  #   starts with a scheme from STANDARD_PROTOCOLS_REGEX -> URI
  #   anything else              -> CURIE
  return str.TYP if str.TYP
  is_blank = str.startsWith('"_:') or str.startsWith('_:')
  return BLANK if is_blank
  return LITERAL if isLiteral(str)
  is_bracketed = str.startsWith('[') and str.endsWith(']')
  return SAFE_CURIE if is_bracketed
  return URI if str.match(STANDARD_PROTOCOLS_REGEX)
  CURIE
class QueryForListenersInProgress
  # Cursor over the terms of a query-for-listeners.
  # @q4l is expected to carry an array-like `query`.

  constructor: (@q4l) ->
    @next_term_idx = 0

  getNextTerm: ->
    # Return the term at the cursor and advance the cursor by one.
    # Once every term has been handed out, return undefined and leave the
    # cursor parked at @q4l.query.length.
    # (Previously the fetched term was discarded, the old index was returned
    # and the cursor kept growing past the end of the query.)
    if @next_term_idx >= @q4l.query.length
      return undefined
    term = @q4l.query[@next_term_idx]
    @next_term_idx++
    return term
class PrefixDb
  # Maps prefixes to their expansions (kept in a NoiceTrie) and converts
  # terms between the URI, CURIE and SAFE_CURIE representations.

  constructor: () ->
    @prefixes = new NoiceTrie()

  add_prefix: (k, v) ->
    # Register expansion v under prefix k; a prefix is synthesized when k is
    # falsy.  Returns whatever @prefixes.add() returns.
    # throws -- Error naming both expansions when add() fails and k is
    #           already bound to a different expansion; otherwise the
    #           original error from add() is rethrown.
    try
      if not k # eg a TTL line like """@prefix : <http://www.w3.org/ns/prov#> ."""
        k = @synthesize_prefix(v)
      @prefixes.add(k, v)
    catch e
      # TODO handle the situation where a prefix has already been defined as
      #   something else, eg by retrying under the key produced by
      #   @synthesize_non_colliding_prefix(k, v, old_v)
      old_v = @prefixes.getValue(k)
      if old_v isnt v
        console.log(@prefixes.tree())
        msg = "prefix #{k}: <#{v}> has already been defined as <#{old_v}> node_count:#{@prefixes.node_count}"
        throw new Error(msg)
      # NOTE(review): the rethrow is taken to apply when the stored expansion
      # equals v -- confirm against the intended behaviour of NoiceTrie.add()
      throw e

  dump: (msg) ->
    # Log the trie and every prefix/expansion pair to the console.
    console.log(msg? and "dump(\"#{msg}\")" or "dump()")
    console.log @prefixes.tree()
    for k, v of @prefixes.k2leaf
      console.log " #{k}: #{v.getValue()}"

  synthesize_prefix: () ->
    # Return the next 'SYN<n>' prefix.  The counter starts at 1 and is bumped
    # before use, so the first prefix handed out is 'SYN2'.  Arguments passed
    # by callers are ignored.
    if not @synth_count
      @synth_count = 1
    @synth_count++
    return 'SYN'+@synth_count

  synthesize_non_colliding_prefix: (k, v, existing_v) ->
    # create a new prefix based on prefix which does not collide with the already existing
    return k + @synthesize_prefix()

  make_key: (to_abbrev, retval, noice_trie) ->
    # Callback for the trie: produce a prefix for a uri it could not abbreviate.
    #   to_abbrev  -- the uri needing a prefix
    #   retval     -- a key suggested by the caller; returned as is when truthy
    #   noice_trie -- queried via has_k() to avoid handing out a taken prefix
    # returns a synthesized prefix for http(s) and file: uris, else "BADKEY"
    if not @prefixes?
      throw new Error("make_key() called unbound")
    # if abbrev failed to find a key, make_key is called
    # retval is passed to make_key so custom implementations can override it
    if retval
      return retval
    if to_abbrev.match(DOMAIN_NAME_REGEX)
      prfx = @synthesize_prefix()
      while noice_trie.has_k(prfx)
        prfx = @synthesize_prefix()
      # TODO improve algorithm for synthesizing prefixes.  A disabled draft
      #   derived candidates from the lowercased domain name by dropping
      #   vowels, dots, 'www' and the tld, and took the first candidate of
      #   3+ characters not already in the trie.
      return prfx
    if to_abbrev.match(FILE_URI_REGEX)
      return @synthesize_prefix(to_abbrev)
    return "BADKEY"

  make_cb_for_make_key: ->
    # Return make_key bound to this instance, suitable for passing as a callback.
    return (a,b,c) =>
      @make_key(a,b,c)

  toSafeCURIE: (uri_or_curie, make_key) ->
    # Return uri_or_curie as a SAFE_CURIE, eg '[dc:title]'.
    typ = getTyp(uri_or_curie)
    if typ is SAFE_CURIE
      return uri_or_curie
    if typ is CURIE
      return curie_to_safe_curie(uri_or_curie)
    # URI then!
    return curie_to_safe_curie(@prefixes.abbrev(uri_or_curie, make_key or @make_cb_for_make_key()))

  toCURIE: (uri_or_curie, context, make_key) ->
    # Return uri_or_curie as a CURIE, eg 'dc:title'.
    #   context  -- accepted for signature compatibility; not otherwise read here
    #   make_key -- optional callback replacing the default make_key
    # throws -- Error when a function is passed in the context position
    #           (this used to terminate the whole process via process.exit())
    if typeof(context) is 'function'
      throw new Error("toCURIE() is being called with the make_key arg in the wrong position")
    typ = getTyp(uri_or_curie)
    if typ is SAFE_CURIE
      return safe_curie_to_curie(uri_or_curie)
    if typ is CURIE
      return uri_or_curie
    # URI then!
    return @prefixes.abbrev(uri_or_curie, make_key or @make_cb_for_make_key())

  fromSafeCURIE: (safe_curie) -> # eg '[dc:title]'
    # Expand a SAFE_CURIE to the full uri.
    # throws -- Error when safe_curie does not match SAFE_CURIE_REGEX, when
    #           its prefix is unknown, or when no local part was captured
    match = safe_curie.match(SAFE_CURIE_REGEX)
    if not match?
      throw new Error("fromSafeCURIE() expects a safe_curie, unlike '#{safe_curie}'")
    [ignore, prefix, local_part] = match
    expansion = @prefixes.getValue(prefix)
    if not expansion?
      if prefix is '_'
        # agree with fromCURIE(): blanks keep their '_:' lead-in
        expansion = '_:'
      else
        throw new Error("no prefix found for #{safe_curie}")
    if not local_part?
      throw new Error("uh no local_part found in #{safe_curie}")
    return expansion + local_part

  fromCURIE: (curie) -> # eg 'dc:title'
    # Expand a CURIE to the full uri.  The CURIE is split at its FIRST colon
    # so that a local part which itself contains ':' is kept whole.
    # throws -- Error when the prefix is unknown or there is no local part
    colon_at = curie.indexOf(':')
    if colon_at < 0
      prefix = curie
      local_part = undefined
    else
      prefix = curie.slice(0, colon_at)
      local_part = curie.slice(colon_at + 1)
    expansion = @prefixes.getValue(prefix)
    if not expansion?
      if prefix is '_' # TODO add TYP called BLANK for when prefix is underscore
        expansion = '_:'
      else
        throw new Error("no prefix found for #{curie} in @prefixes with #{@prefixes.node_count} nodes")
    if not local_part?
      throw new Error("uh no local_part found in #{curie}")
    return expansion + local_part

  XXXaddPrefix: (prefix, expansion) ->
    # Disabled legacy method (note the XXX); it treats @prefixes as a plain
    # dictionary rather than going through the trie's add().
    if @prefixes[prefix]?
      throw new Error("prefix #{prefix} already exists")
    @prefixes[prefix] = expansion

  canonicalize_BLANK: (blank_uri, context) ->
    # Replace the leading '_' of blank_uri with the blank prefix of context,
    # creating (and caching on context) that prefix when it is missing.
    # throws -- Error when the resulting prefix is '_'
    prefix = context.blank_prefix
    if not prefix
      prefix = @getOrCreate_prefix_for_blanks_in_ctx(context)
    if global?.TRACE
      console.log "canonicalize_BLANK('#{blank_uri}') prefix='#{prefix}', context=", context
    if prefix is '_'
      throw new Error('BLANKS should never be stored with _ as their prefix')
    return prefix + blank_uri.slice(1) # replace the leading _ with the prefix

  getOrCreate_prefix_for_blanks_in_ctx: (context) ->
    # Purpose:
    #   Provide a way to have prefixes which define a separate namespace
    #   for the BLANK nodes of each RDF document (and each time it was retrieved).
    # Examples:
    #   ('http://gov.ca/stats.ttl', '2017-07-02T20:58:34') ==> 'SYN123.1'
    #     this would hold if this was the first retrieval of the ctx_uri
    #     AND SYN123 was the prefix for ctx_uri
    #   ('http://gov.ca/stats.ttl', '2017-07-02T21:15:34') ==> 'SYN123.2'
    #     this would hold if this was the second retrieval of the ctx_uri, etc
    # NOTE(review): the code below always appends '.1'; the '.2' case from the
    #   example is not implemented.
    #
    # prepend a special something to uri to make an uri which signifies
    # prx4bl4uri, ie 'prefix for BLANKs for the URI'
    # if there is already a prefix for bl4uri return it, else make it
    #   OGHAM FEATHER MARK (U+169B, Ps): ᚛
    #   OGHAM REVERSED FEATHER MARK (U+169C, Pe): ᚜
    # Reserve the possibility of putting a timestamp between the feather marks
    #   eg '᚜2017-07-02T20:22:00᚛'
    #   signifying that the file was retrieved at that time
    special_something = "᚜#{context.timestamp or ''}᚛"
    uri_for_blanks = context.uri + special_something
    ctx_uri_key = @getOrCreate_prefix_for(context.uri)
    prefix_for_blanks = ctx_uri_key + '.1'
    @add_prefix(prefix_for_blanks, uri_for_blanks)
    context.blank_prefix = prefix_for_blanks # cached so later blanks reuse it
    return prefix_for_blanks

  getOrCreate_prefix_for: (uri) ->
    # Return the prefix the trie finds or makes for uri.
    # throws -- Error when uri is null or undefined
    uri? or throw new Error('uri must be defined')
    return @prefixes.find_or_make_key_for(uri, @make_cb_for_make_key())
class RsrcDb
  # Index of Rsrc instances keyed by the canonical form of their term.

  constructor: (@prefixdb) ->
    # No prototype, so keys such as 'constructor' or 'toString' cannot be
    # mistaken for stored members.
    @mbrs = Object.create(null)

  get_canonical_typ: (key, current_typ_or_uri_scheme) ->
    # The canonical_typ for a key is the representation_typ it should be stored
    # and compared in.  The return value is one of the TYPs themselves:
    #   URI, SAFE_CURIE, CURIE, LITERAL
    # instead of being the key IN that representation.
    #
    # The canonical_typ for URI representation in RsrcDb is CURIE, but some
    # URI schemes (the non-DNS ones particularly) do not benefit from the
    # compression provided by conversion to CURIE so their canonical_typ is URI.
    #   (eg geo:145.33,-23.44 OR isbn:91233123333)
    # In other words, for such resources the URI itself is used as the key
    # not the CURIE.
    #
    # CURIE is used for typical urls because it is most compact and there is
    # no need for the 'safety' of SAFE_CURIEs.
    #
    # The case of BLANK is interesting.  BLANKs (eg '_:boo') are represented by
    # CURIEs where the CURIE has a structured synthetic prefix which establishes
    # a unique namespace for the contents of each different retrieval of the
    # same document.
    #
    # NOTE: An unsolved problem is how literals should be stored vs
    #   how TYPED literals (eg "1964-07-24^^xsd:date") should be handled vs
    #   how typed literals with custom types
    #   (eg "3/5^^http://example.org/rationalNumber")
    #   (how Jena does this:
    #    https://jena.apache.org/documentation/notes/typed-literals.html)
    if not current_typ_or_uri_scheme?
      current_typ_or_uri_scheme = getTyp(key) # get it if missing
    canonical_typ = TYP_2_CANONICAL_TYP[current_typ_or_uri_scheme]
    if canonical_typ is CURIE and key.match(BARE_URI_REGEX)
      canonical_typ = URI
    if not canonical_typ?
      if current_typ_or_uri_scheme is URI
        scheme = key.split(':')[0]
        if scheme
          # get the canonical_typ for uri schemes with particular ones
          canonical_typ = @get_canonical_typ(key, scheme)
      canonical_typ ?= URI
    return canonical_typ or URI

  get_canonical_form_typ_pair: (key, context) ->
    # Return [canonical_key, canonical_typ] for key.
    #   context -- handed to the prefixdb; BLANK keys read their blank prefix from it
    # throws -- Error when no conversion from the key's current typ to its
    #           canonical typ is known (this used to return undefined, which
    #           made callers fail while destructuring the result)
    current_typ = getTyp(key)
    canonical_typ = @get_canonical_typ(key, current_typ)
    key = key.valueOf()
    if current_typ is BLANK
      return [@prefixdb.canonicalize_BLANK(key, context), canonical_typ]
    if current_typ is canonical_typ
      return [key, canonical_typ]
    if canonical_typ is CURIE
      return [@prefixdb.toCURIE(key, context), CURIE]
    if canonical_typ is URI
      if current_typ is CURIE
        return [@prefixdb.fromCURIE(key), URI]
      # this branch used to be a second, unreachable test for CURIE
      if current_typ is SAFE_CURIE
        return [@prefixdb.fromSafeCURIE(key), URI]
    if canonical_typ is SAFE_CURIE
      return [@prefixdb.toSafeCURIE(key), SAFE_CURIE]
    throw new Error("no conversion known for #{key} from #{TYPS[current_typ]} to #{TYPS[canonical_typ]}")

  check_isUri_and_key: (key) ->
    # Legacy helper, called only from XXgetOrCreate in this file.
    # Returns {isUri, key, typ}; non-literals get their key converted to a CURIE.
    # toCURIE() already unwraps SAFE_CURIEs, so no separate unwrapping is needed.
    typ = getTyp(key)
    if typ isnt LITERAL
      key = @prefixdb.toCURIE(key)
      to_typ = CURIE
      return {isUri: typ isnt LITERAL, key: @prefixdb.toCURIE(key), typ: to_typ}
    return {isUri: false, key: key, typ: LITERAL}

  get: (key, typ, context) ->
    # Return the Rsrc stored under key, or undefined.
    # When typ is given, key is looked up exactly as passed; otherwise key is
    # canonicalized first (context is needed for BLANK keys).
    if not typ?
      [key, typ] = @get_canonical_form_typ_pair(key, context)
    if global?.TRACE
      console.log "get('#{key}', #{typ})"
      for k,v of @mbrs
        console.log " ", k
    return @mbrs[key]

  getOrCreate: (key, isUri, context) ->
    # Return the Rsrc for key, creating and storing it on first sight.
    #   isUri   -- ignored; recomputed from the canonical typ
    #   context -- needed for BLANK keys
    # throws -- Error when a BLANK or BLANK_AS_CURIE key still starts with '_'
    [canonical_key, canonical_typ] = @get_canonical_form_typ_pair(key, context)
    if canonical_typ > CURIE # ie BLANK or BLANK_AS_CURIE
      if canonical_key.startsWith('_')
        throw new Error(TYPS[canonical_typ]+" should not start with _")
    isUri = not (canonical_typ is LITERAL)
    rsrc = @mbrs[canonical_key]
    if not rsrc?
      rsrc = new Rsrc(canonical_key, isUri, canonical_typ, @)
      @mbrs[canonical_key] = rsrc
    return rsrc

  XXgetOrCreate: (key, isUri) ->
    # Disabled legacy variant of getOrCreate (note the XX).
    if not isUri? # "is isUri specified?" NOT "is it true?"
      {isUri, key, typ} = @check_isUri_and_key(key)
    rsrc = @mbrs[key]
    if not rsrc?
      rsrc = new Rsrc(key, isUri, typ, @)
      @mbrs[key] = rsrc
    return rsrc

  set: (key, obj, context) ->
    # Record obj as an occurrence of key.
    #   context -- optional; passed through so BLANK keys can be canonicalized
    @getOrCreate(key, undefined, context).addObj(obj)

  getAll: (key, typ, context) ->
    # Return every occurrence recorded for key, or [] when key is unknown.
    rsrc = @get(key, typ, context)
    if rsrc?
      return rsrc.all()
    return []

  getOnly: (key, typ, context) ->
    # Return the first occurrence recorded for key, or undefined.
    #   context -- optional, mirroring getAll()
    rsrc = @get(key, typ, context)
    if rsrc?
      return rsrc.first()
    return undefined

  exists: (key, context) ->
    # Report whether a Rsrc is stored for key.
    #   context -- optional; needed when key is a BLANK
    # throws -- Error when key is null or undefined
    if not key?
      throw new Error "exists() requires that key have a value"
    return @get(key, undefined, context)?

  dump: ->
    # Return a printable listing: a 72 dash rule, then one line per member.
    buffer = '-'.repeat(72) + "\n"
    for k,v of @mbrs
      buffer += " #{k} = #{v.getValue()}\n"
    return buffer
class Rsrc
  # Each URI has one Rsrc instance built for it.
  #   @_key  -- the term, in the representation named by @typ
  #   @isUri -- false for literals
  #   @typ   -- one of the TYP constants
  #   @db    -- the owning RsrcDb; its prefixdb does the conversions

  constructor: (@_key, @isUri, @typ, @db) ->
    @occs = [] # occurrences
    @queriesForListeners = [] # this resource is the first one mentioned in these queries

  addObj: (spogi, processQueries) ->
    # Record spogi as an occurrence; when processQueries is truthy, offer it
    # to every registered query via sendIfSatisfies().
    @occs.push(spogi)
    if processQueries
      if @queriesForListeners.length
        @queriesForListeners.forEach (q4l) ->
          q4l.sendIfSatisfies(spogi)

  addQueryForListeners: (q4l) ->
    @queriesForListeners.push(q4l)

  last: () ->
    @occs[@occs.length - 1]

  first: () ->
    @occs[0]

  all: () ->
    return @occs

  key: () ->
    @_key

  raw: () ->
    "#{@_key}"

  getNativeValue: () ->
    # Native value of the term, obtained from RdfObject and cached in @_ntval.
    if not @_ntval?
      obj = new RdfObject(@_key)
      @_ntval = obj.getNativeValue()
    @_ntval

  getValue: () ->
    # The `value` of the term as an RdfObject, cached in @_val.
    if not @_val?
      obj = new RdfObject(@_key)
      @_val = obj.value
    @_val

  eql: (val) ->
    # Strict comparison of the key with val; a mismatch is logged to the console.
    retval = @_key is val
    if not retval
      console.log(@_key, '<>', val)
    return retval

  repr: ->
    # TODO if this Rsrc is really a value (because its the @obj) then return it properly formatted
    if @typ is BLANK_AS_CURIE
      return "<_:#{@local_part()}>"
    if @isUri
      return "<#{@uri()}>"
    return @literal_value()

  literal_value: ->
    # The key as text; keys containing a newline are wrapped in triple quotes.
    val = @_key
    if val.match(/\n/)
      return '"""' + val + '"""'
    return "#{val}"

  asTTL: ->
    if @isUri
      return @curie()
    return @literal_value()

  local_part: ->
    # Everything after the FIRST colon of the key, so local parts which
    # themselves contain ':' survive; undefined when there is no colon.
    colon_at = @_key.indexOf(':')
    if colon_at < 0
      return undefined
    return @_key.slice(colon_at + 1)

  prefix_part: ->
    @_key.split(':')[0]

  safe_curie: ->
    if @typ is SAFE_CURIE
      return @_key
    # double quotes: CoffeeScript does not interpolate #{} in single-quoted strings
    return "[#{@curie()}]"

  uri: ->
    # The term as a full uri.
    # throws -- Error when this resource is not a uri
    if not @isUri
      throw new Error("#{@_key} has typ: #{@typ} and is hence not a uri")
    if @typ is SAFE_CURIE
      return @db.prefixdb.fromSafeCURIE(@_key)
    if @typ is CURIE
      return @db.prefixdb.fromCURIE(@_key)
    return @_key # by elimination, @_key is already typ URI

  curie: () ->
    # The term as a CURIE.
    # throws -- Error when this resource is not a uri or has an unknown typ
    if not @isUri
      throw new Error("#{@_key} has typ: #{@typ} and is hence not a uri")
    if @typ in [CURIE, BLANK_AS_CURIE]
      return @_key
    if @typ is URI
      return @db.prefixdb.toCURIE(@_key)
    if @typ is SAFE_CURIE
      return safe_curie_to_curie(@_key)
    throw new Error("'#{@_key}' is of mysterious typ: #{@typ}")
# Publish the public names on `exports` when it exists, else on `this`.
# NOTE(review): BLANK_AS_CURIE is not published although TYPS names it -- confirm this is intended.
published = {
  isLiteral, PrefixDb, RsrcDb, Rsrc, getTyp,
  LITERAL, URI, SAFE_CURIE, CURIE, BLANK,
  TYPS, STANDARD_PROTOCOLS_REGEX
}
publish_on = exports ? this
for name, value of published
  publish_on[name] = value